Okay, here it is. Attached is the regular expression patch. It
currently segfaults on Windows because of a combination of two factors:
1) There are some bounds-checking issues in key.c
2) Windows's malloc() isn't as robust as Unix's
This is only a problem on native Windows, not on Cygwin; I've confirmed
this myself.
Besides what you'd expect it to do, this patch makes a (very) minor
change to string.c. Basically, it makes it so that you can take a
zero-length substring with an index equal to the size of the string.
This is done so that the equivalent to "<$`><$&><$'>" doesn't need a
special case in the bytecode when $& reaches all the way to the end of
the string.
Copious documentation is included in rx.ops, and twenty tests are
included in t/op/rx.t. This patch is reliant on the ParrotPointer patch
I sent in earlier.
I can't quite guarantee that the patch will apply cleanly--I had to
manually change some things in it--but the things that may not apply
well should be easy to put in manually.
UNIMPLEMENTED OPCODES:
rx_compile - compile a regex
rx_cloneinfo - clone the info structure (used for look(ahead|behind)s)
UNTESTED OPCODES:
rx_forwards - tell the regex to increment the current index when moving
rx_backwards - tell the regex to decrement the current index when
moving
Share and enjoy.
--Brent Dax
[EMAIL PROTECTED]
Configure pumpking for Perl 6
<obra> mmmm. hawt sysadmin chx0rs
<lathos> This is sad. I know of *a* hawt sysamin chx0r.
<obra> I know more than a few.
<lathos> obra: There are two? Are you sure it's not the same one?
--- parrot-cvs/Makefile.in Wed Jan 9 02:51:00 2002
+++ parrot/Makefile.in Wed Jan 9 02:50:24 2002
@@ -63,7 +63,7 @@
$(INC)/global_setup.h $(INC)/vtable.h $(INC)/oplib/core_ops.h
$(INC)/oplib/core_ops_prederef.h \
$(INC)/runops_cores.h $(INC)/trace.h \
$(INC)/pmc.h $(INC)/key.h $(INC)/resources.h $(INC)/platform.h \
-$(INC)/interp_guts.h ${jit_h} ${jit_struct_h}
+$(INC)/interp_guts.h ${jit_h} ${jit_struct_h} $(INC)/rx.h
CLASS_O_FILES = classes/default$(O) classes/perlint$(O) classes/perlstring$(O) \
classes/perlnum$(O) classes/perlarray$(O) classes/perlundef$(O) \
@@ -79,7 +79,7 @@
INTERP_O_FILES = global_setup$(O) interpreter$(O) parrot$(O) register$(O) \
core_ops$(O) core_ops_prederef$(O) memory$(O) packfile$(O) stacks$(O) \
string$(O) encoding$(O) chartype$(O) runops_cores$(O) trace$(O) pmc$(O) key$(O) \
-platform$(O) ${jit_o} resources$(O)
+platform$(O) ${jit_o} resources$(O) rx$(O)
O_FILES = $(INTERP_O_FILES) $(IO_O_FILES) $(CLASS_O_FILES) $(ENCODING_O_FILES)
$(CHARTYPE_O_FILES)
@@ -292,6 +292,8 @@
register$(O): $(H_FILES)
+rx$(O): $(H_FILES)
+
stacks$(O): $(H_FILES)
core_ops$(O): $(H_FILES) core_ops.c
@@ -396,4 +398,3 @@
lint: ${test_prog}
$(LINT) ${cc_inc} -Iclasses $(LINTFLAGS) `echo $(O_FILES) | sed 's/\.o/\.c/g'`
$(LINT) ${cc_inc} $(LINTFLAGS) test_main.c
-
--- parrot-cvs/MANIFEST Wed Jan 9 02:25:54 2002
+++ parrot/MANIFEST Tue Jan 8 16:32:58 2002
@@ -109,6 +110,7 @@
include/parrot/pmc.h
include/parrot/register.h
include/parrot/resources.h
+include/parrot/rx.h
include/parrot/runops_cores.h
include/parrot/stacks.h
include/parrot/string.h
@@ -185,6 +187,8 @@
pmc_pm.pl
register.c
resources.c
+rx.c
+rx.ops
runops_cores.c
stacks.c
string.c
@@ -198,8 +202,8 @@
t/op/macro.t
t/op/number.t
t/op/pmc.t
t/op/pmc_perlhash.t
t/op/pmc_perlstring.t
+t/op/rx.t
t/op/stacks.t
t/op/string.t
t/op/time.t
--- /dev/null Wed Jan 9 03:01:05 2002
+++ /parrot/include/parrot/rx.h Tue Jan 8 15:20:58 2002
@@ -0,0 +1,92 @@
+/* rx.h
+ * Copyright: (When this is determined...it will go here)
+ * CVS Info
+ * $Id$
+ * Overview:
+ * Supporting file for the regular expression engine
+ * Data Structure and Algorithms:
+ * rxinfo is the main structure involved in regular expressions; it's stuffed
+ * into a Handle PMC and passed to all regular expression opcodes.
+ * History:
+ * Notes:
+ * References:
+ */
+
+#if !defined(PARROT_RX_H_GUARD)
+#define PARROT_RX_H_GUARD
+
+#include "parrot/parrot.h"
+
+typedef enum rxflags {
+ enum_rxflags_none=0,
+ enum_rxflags_case_insensitive=1,
+ enum_rxflags_single_line=2,
+ enum_rxflags_multiline=4,
+ enum_rxflags_reverse=8
+} rxflags;
+
+typedef enum rxdirection {
+ enum_rxdirection_forwards=1,
+ enum_rxdirection_backwards=-1
+} rxdirection;
+
+extern const INTVAL RX_MARK;
+
+typedef struct rxinfo {
+ STRING *string;
+ INTVAL index;
+ INTVAL startindex;
+ BOOLVAL success;
+
+ rxflags flags;
+ INTVAL minlength;
+ rxdirection whichway;
+
+ PMC *groupstart;
+ PMC *groupend;
+
+ opcode_t *substfunc;
+
+ struct Stack_Entry *stack_top;
+ struct StackChunk *stack_base;
+} rxinfo;
+
+rxinfo * rx_allocate_info(struct Parrot_Interp *, STRING *);
+
+BOOLVAL rx_is_word_character(char ch);
+BOOLVAL rx_is_number_character(char ch);
+BOOLVAL rx_is_whitespace_character(char ch);
+
+STRING *rxP_get_substr(struct Parrot_Interp *, STRING *, INTVAL, INTVAL);
+
+#define RX_dUNPACK(pmc) rxinfo *rx=(rxinfo *)pmc->data
+/* this one is really quite evil */
+#define RxCurChar(rx) ((char
+*)rx->string->bufstart)[rx->index]
+#define RxCurCharS(rx) rxP_get_substr(interpreter,
+rx->string, rx->index, 1)
+
+#define RxAdvance(rx) RxAdvanceX(rx, 1)
+#define RxAdvanceX(rx, x) rx->index += x * rx->whichway
+
+#define RxCaseInsensitive_on(rx) RxFlagOn(rx, enum_rxflags_case_insensitive)
+#define RxCaseInsensitive_off(rx) RxFlagOff(rx, enum_rxflags_case_insensitive)
+#define RxCaseInsensitive_test(rx) RxFlagTest(rx, enum_rxflags_case_insensitive)
+
+#define RxSingleLine_on(rx) RxFlagOn(rx, enum_rxflags_single_line)
+#define RxSingleLine_off(rx) RxFlagOff(rx, enum_rxflags_single_line)
+#define RxSingleLine_test(rx) RxFlagTest(rx, enum_rxflags_single_line)
+
+#define RxMultiline_on(rx) RxFlagOn(rx, enum_rxflags_multiline)
+#define RxMultiline_off(rx) RxFlagOff(rx, enum_rxflags_multiline)
+#define RxMultiline_test(rx) RxFlagTest(rx, enum_rxflags_multiline)
+
+#define RxReverse_on(rx) RxFlagOn(rx, enum_rxflags_reverse)
+#define RxReverse_off(rx) RxFlagOff(rx, enum_rxflags_reverse)
+#define RxReverse_test(rx) RxFlagTest(rx, enum_rxflags_reverse)
+
+#define RxFlagOn(rx, flag) (rx->flags |= flag)
+#define RxFlagOff(rx, flag) (rx->flags &= ~flag)
+#define RxFlagTest(rx, flag) (rx->flags & flag)
+
+#define RxFlagsOff(rx) rx->flags = enum_rxflags_none
+
+#endif
\ No newline at end of file
--- /dev/null Wed Jan 9 03:01:06 2002
+++ /parrot/rx.ops Tue Jan 8 16:16:54 2002
@@ -0,0 +1,1269 @@
+/*
+** rx.ops
+*/
+
+#include "parrot/rx.h"
+
+#define RxAssertMore(rx, branchto) if(rx->index >= string_length(rx->string)) { goto
+OFFSET(branchto); }
+
+VERSION = PARROT_VERSION;
+
+=head1 NAME
+
+rx.ops - Parrot Regular Expression Engine, version 3.0
+
+=head1 SYNOPSIS
+
+ # NOTE: This looks a LOT scarier than it really is
+ # "zzabbBBBBBBcdcdcdzz" =~ /ab*[cd]+/i
+ rx_allocateinfo P0, "zzabbBBBBBBcdcdcdzz"
+ bsr RX_0
+ rx_info_successful P0, I0
+ rx_freeinfo P0
+ if I0, $match
+ print "no "
+ $match:
+ print "match"
+ end
+
+ RX_0:
+ rx_setprops P0, "i", 2
+ branch $start0
+ $advance:
+ rx_advance P0, $fail
+ $start0:
+ rx_literal P0, "a", $advance
+
+ $start1:
+ rx_pushmark P0
+ $top1:
+ rx_literal P0, "b", $start2
+ rx_pushindex P0
+ branch $top1
+ $back1:
+ rx_popindex P0, $advance
+
+ $start2:
+ rx_literal P0, "cd", $back1
+ $top2:
+ rx_oneof P0, "cd", $succeed
+ branch $top2
+
+ $succeed:
+ rx_succeed P0
+ ret
+ $fail:
+ rx_fail P0
+ ret
+
+
+=head1 DESCRIPTION
+
+The Perl 5 regular expression engine was state-of-the-art. It was the fastest and
+most featureful implementation available. Everybody used Perl 5's regular expression
+syntax wherever possible.
+
+The Perl 5 regular expression engine was also a mess.
+
+The engine was like a separate interpreter unto itself. Few understood its dark
+magic,
+and fewer worked on its baroque source. It was a black box, sealed off from the
+outside
+world with only a couple opcodes to show in other files. It was the slowest part of
+Perl
+to adapt to new features--it was one of the last to get threadsafety and full Unicode
+support--because so few people understood it. Larry Wall once said that three people
+understood the regex engine, give or take four.
+
+Because of these issues, the design documents for Parrot called for regular
+expression
+opcodes to be built in to the interpreter. This group of opcodes, called the Parrot
+Regular Expression Engine version 3.0 (or simply Rx3), is the result.
+
+=head2 Basic Concepts
+
+Perl 5 had one opcode for each operation in the regular expression. For example:
+
+ >perl -mre=debug -e '/ab+[cd]/'
+ Compiling REx `ab+[cd]'
+ size 15 first at 1
+ 1: EXACT <a>(3)
+ 3: PLUS(6)
+ 4: EXACT <b>(0)
+ 6: ANYOF[cd](15)
+ 15: END(0)
+ anchored `ab' at 0 floating `b' at 1..2147483647 (checking anchored) minlen 3
+ Freeing REx: `ab+[cd]'
+
+(The C<re> pragma with the 'debug' switch displays the compiled version of the regex.
+The numbers in parenthesis represent where to jump to on success; 0 is a
+special value meaning "this part of the regex is done".)
+
+In Rx3, that regular expression would be something like:
+
+ $advance:
+ rx_advance P0, $fail
+ $start:
+ rx_literal P0, "ab", $advance
+ rx_pushmark P0
+ $top:
+ rx_pushindex P0
+ rx_literal P0, "b", $next
+ branch $top
+ $backtrack:
+ rx_popindex P0, $advance
+ $next:
+ rx_oneof P0, "cd", $backtrack
+ branch $success
+
+(In Rx3, the last parameter is a label to branch to on I<failure>, not success.)
+
+If you were insane enough to convert the labels to offsets, you'd get something like:
+
+ rx_advance P0, $fail
+ $start:
+ rx_literal P0, "ab", -6
+ rx_pushmark P0
+ rx_pushindex P0
+ rx_literal P0, "b", 6
+ branch -7
+ rx_popindex P0, -19
+ rx_oneof P0, "cd", -6
+ branch $success
+
+9 operations in Rx3 to 5 in Perl 5. I can already hear the cynicism: "how could
+that be BETTER?!?" Well, there's several reasons.
+
+The first is that it frees us to use normal ops, and in fact they're used all the
+time. C<branch> is a normal op; so is C<bsr>, the normal way to call a regular
+expression. Things like C<(?{CODE})> can be implemented with relative ease--simply
+put the normal opcodes in the appropriate place in the regex. If you're debugging
+a regex, you can simply sprinkle output messages liberally throughout the regex.
+
+The second is opcode dispatch. Parrot has very fast opcode dispatch, and we can use
+that to our advantage.
+
+Finally, there's the matter of optimizations. As an example, take C</a+bc+/>. The
+most efficient way to look for that is probably to look for the constant string 'abc'
+and expand outwards from there--especially if you use Boyer-Moore or another fast
+search algorithm. It means that the code generator can decide whether to optimize
+for success or failure, for compilation or execution speed. You get the idea.
+
+Bottom line is, Rx3 lays out exactly what's going on. This is a feature. It gives
+the
+regex compiler total control over what's going on.
+
+=head2 The Opcodes
+
+There are two basic rules to how the opcodes operate.
+
+The first involves the PMC that most take as their first parameter. This is a handle
+for
+an 'info' structure. The info structure accumulates data as it churns through the
+regex,
+such as the start and current indices and the start and end of each group. It also
+keeps
+track of things like the string we're matching against.
+
+The second rule pertains to the ops that have an integer constant as their last
+parameter.
+For the most part, these ops will branch to that parameter if the 'fail'. For most
+ops,
+'fail' means 'fail to match'.
+
+If the documentation for an op doesn't specifically mention the first or last
+parameter,
+that's what they are.
+
+The documentation for each opcode follows.
+
+=cut
+
+###############################################################################
+
+=head3 Preparation
+
+=over 4
+
+=cut
+
+########################################
+
+=item C<rx_allocateinfo>(p, p|s|sc)
+
+Allocates a new info structure and puts it into the first parameter. The second
+parameter
+is the string to match against.
+
+=cut
+
+op rx_allocinfo(out pmc, in str) {
+ rxinfo *rx=rx_allocate_info(interpreter, $2);
+
+ $1=pmc_new(interpreter, enum_class_ParrotPointer);
+
+ $1->data=(void*)rx;
+
+ goto NEXT();
+}
+
+op rx_allocinfo(out pmc, in pmc) {
+ rxinfo *rx=rx_allocate_info(interpreter, $2->vtable->get_string(interpreter,
+$2));
+
+ $1=pmc_new(interpreter, enum_class_ParrotPointer);
+
+ $1->data=(void*)rx;
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_freeinfo>(p)
+
+Deallocates the info structure in the first parameter and nulls out the handle.
+
+=cut
+
+op rx_freeinfo(inout pmc) {
+ mem_sys_free($1->data);
+ $1->data=NULL;
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_cloneinfo>(p)
+
+Clones the info structure in the first parameter. Make sure to save the original
+structure in another register, the stack, or a symbol table entry before calling this
+opcode.
+
+B<XXX> Currently this op has not been implemented.
+
+=cut
+
+op rx_cloneinfo(inout pmc) {
+ RX_dUNPACK($1);
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_compile>(i, s|sc, s|sc)
+
+Provides a built-in regular expression compiler. The first parameter is set to the
+address of the newly-compiled regex, which can then be C<jsr>'ed to; the second
+parameter is the regex itself; and the third parameter is the modifiers on the regex.
+
+B<XXX> Currently this op has not been implemented.
+
+=cut
+
+op rx_compile(out str, in str, in str) {
+ $1=0;
+
+ goto NEXT();
+}
+
+###############################################################################
+
+=back
+
+=head3 Info accessor ops
+
+=over 4
+
+=cut
+
+########################################
+
+=item C<rx_info_successful>(p, i)
+
+If the info structure indicates the match was successful, sets the second parameter
+to true; otherwise sets it to false.
+
+=cut
+
+op rx_info_successful(in pmc, out int) {
+ RX_dUNPACK($1);
+
+ $2=rx->success;
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_info_getindex>(p, i)
+
+Retrieves the current index stored in the info structure. If the match has already
+finished successfully, this will be the index of the end of the match.
+
+=cut
+
+op rx_info_getindex(in pmc, out int) {
+ RX_dUNPACK($1);
+
+ $2=rx->index;
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_info_getstartindex>(p, i)
+
+Gets the index the match started at.
+
+Note that if a regex uses the C<rx_backwards(p)> op, the start and end indices may be
+reversed.
+
+=cut
+
+op rx_info_getstartindex(in pmc, out int) {
+ RX_dUNPACK($1);
+
+ $2=rx->startindex;
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_info_getgroup>(in pmc, out int, out int, in int)
+
+Gets the start and end indices of the group indicated by the fourth parameter.
+
+=cut
+
+op rx_info_getgroup(in pmc, out int, out int, in int) {
+ RX_dUNPACK($1);
+
+ $2=rx->groupstart->vtable->get_integer_index(interpreter, rx->groupstart, $4);
+ $3=rx->groupend->vtable->get_integer_index(interpreter, rx->groupend, $4);
+
+ goto NEXT();
+}
+
+###############################################################################
+
+=back
+
+=head3 Stack manipulation ops
+
+=over 4
+
+=cut
+
+########################################
+
+=item C<rx_pushindex>(p)
+
+Pushes the current index onto the stack contained in the info structure.
+
+=cut
+
+op rx_pushindex(in pmc) {
+ RX_dUNPACK($1);
+
+ push_generic_entry(interpreter, &rx->stack_top, &rx->index, STACK_ENTRY_INT,
+NULL);
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_pushmark>(p)
+
+Pushes a 'mark' onto the stack contained in the info structure. Marks are used
+to indicate where one operation's backtrack information ends and another's begins.
+
+=cut
+
+op rx_pushmark(in pmc) {
+ RX_dUNPACK($1);
+
+ /* Don't worry about the const warning from the next line */
+ push_generic_entry(interpreter, &rx->stack_top, &RX_MARK, STACK_ENTRY_INT,
+NULL);
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_popindex>(p, ic)
+
+Pops an index off the stack. If it pops a mark off instead, it branches to the
+second parameter.
+
+=cut
+
+op rx_popindex(in pmc, in int) {
+ RX_dUNPACK($1);
+ int i;
+
+ pop_generic_entry(interpreter, &rx->stack_top, &i, STACK_ENTRY_INT);
+
+ if(i==RX_MARK) {
+ goto OFFSET($2);
+ }
+ else {
+ rx->index=i;
+ goto NEXT();
+ }
+}
+
+###############################################################################
+
+=back
+
+=head3 Directional ops
+
+=over 4
+
+=cut
+
+########################################
+
+=item C<rx_forwards>(p)
+
+Indicates that the regex should increment the index as it moves through the string.
+
+=cut
+
+op rx_forwards(in pmc) {
+ RX_dUNPACK($1);
+
+ rx->whichway=enum_rxdirection_forwards;
+
+ goto NEXT();
+}
+
+
+########################################
+
+=item C<rx_backwards>(p)
+
+Indicates that the regex should decrement the index as it moves through the string.
+This is different from reversed regexes (see L</"rx_setprops(p, sc, ic)">); reversed
+affects the start index, while backwards affects the end index.
+
+=cut
+
+op rx_backwards(in pmc) {
+ RX_dUNPACK($1);
+
+ rx->whichway=enum_rxdirection_backwards;
+
+ goto NEXT();
+}
+
+###############################################################################
+
+=back
+
+=head3 Matching ops
+
+=over 4
+
+=cut
+
+########################################
+
+=item C<rx_advance>(p, ic)
+
+Increments (or decrements, if the C<r> modifier is used) the start index one
+character. Branches to the second parameter if it goes past the end of the string.
+
+=cut
+
+op rx_advance(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ if(!RxReverse_test(rx)) {
+ if(++rx->startindex + rx->minlength > string_length(rx->string)) {
+ goto OFFSET($2);
+ }
+ }
+ else {
+ if(--rx->startindex < 0) {
+ goto OFFSET($2);
+ }
+ }
+
+ rx->index=rx->startindex;
+
+ while(stack_depth(interpreter, rx->stack_base)) {
+ pop_generic_entry(interpreter, &rx->stack_top, NULL, STACK_ENTRY_INT);
+ }
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_incrindex>(p, ic)
+
+Increments the current index (or decrements, if C<rx_backwards> is used) by the
+amount in the second parameter. Does I<not> check if it's gone past the end of the
+string.
+
+=cut
+
+op rx_incrindex(in pmc, in int) {
+ RX_dUNPACK($1);
+ RxAdvanceX(rx, $2);
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_setprops>(p, sc, ic)
+
+Sets certain properties in the info structure. The second parameter is a string
+containing one or more of the following characters:
+
+=over 4
+
+=item C<i>
+
+Sets case-insensitive matching.
+
+=item C<s>
+
+Sets single-line matching; the C<rx_dot> op will match newlines with this turned on.
+
+=item C<m>
+
+Sets multiline matching; the C<rx_zwa_atbeginning> and C<rx_zwa_atend> opcodes will
+match the beginning and end of lines.
+
+=item C<r>
+
+Sets reverse or right matching; match starts at the end of the string and inches
+towards the beginning.
+
+=back
+
+The third parameter is the minimum length the string would need to be for a match to
+be possible. For example, in the match C</ba*r+/>, the minimum length is 2.
+
+=cut
+
+op rx_setprops(in pmc, in str, in int) {
+ int i;
+ RX_dUNPACK($1);
+
+ rx->minlength=$3;
+
+ for(i=0; i < string_length($2); i++) {
+ switch(((char *)$2->bufstart)[i]) {
+ case 'i':
+ RxCaseInsensitive_on(rx);
+ break;
+ case 's':
+ RxSingleLine_on(rx);
+ break;
+ case 'm':
+ RxMultiline_on(rx);
+ break;
+ case 'r':
+ RxReverse_on(rx);
+ rx->index=rx->startindex=string_length(rx->string);
+ break;
+ default:
+ fprintf(stderr, "Unknown regular expression option
+'%c'.", ((char*)$2->bufstart)[i]);
+ HALT();
+ }
+ }
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_startgroup>(p, ic)
+
+Indicates that the current index is the start index of the group number indicated in
+the second parameter.
+
+=cut
+
+op rx_startgroup(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ rx->groupstart->vtable->set_integer_index(interpreter, rx->groupstart,
+rx->index, $2);
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_endgroup>(p, ic)
+
+Indicates that the current index is the end index of the group number indicated in
+the second parameter.
+
+=cut
+
+op rx_endgroup(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ rx->groupend->vtable->set_integer_index(interpreter, rx->groupend, rx->index,
+$2);
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_literal>(p, in str, ic)
+
+Matches the exact string (sensitive to the C<i> modifier) passed in the second
+parameter.
+
+B<XXX> Currently does not honor the C<i> modifier.
+
+=cut
+
+op rx_literal(in pmc, in str, in int) {
+ RX_dUNPACK($1);
+ STRING *targ;
+
+ if(string_length(rx->string) < rx->index+string_length($2)) {
+ goto OFFSET($3);
+ }
+
+ targ=rxP_get_substr(interpreter, rx->string, rx->index, string_length($2));
+
+ if(string_compare(interpreter, $2, targ)==0) {
+ RxAdvanceX(rx, string_length($2));
+ }
+ else {
+ goto OFFSET($3);
+ }
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_is_w>(p, ic)
+
+Matches a word character (usually C<\w>).
+
+=cut
+
+op rx_is_w(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ RxAssertMore(rx, $2);
+
+ if(rx_is_word_character(RxCurChar(rx))) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else {
+ goto OFFSET($2);
+ }
+}
+
+########################################
+
+=item C<rx_is_n>(p, ic)
+
+Matches a number character (usually C<\n>).
+
+=cut
+
+op rx_is_n(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ RxAssertMore(rx, $2);
+
+ if(rx_is_number_character(RxCurChar(rx))) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else {
+ goto OFFSET($2);
+ }
+}
+
+########################################
+
+=item C<rx_is_s>(p, ic)
+
+Matches a whitespace character (usually C<\s>).
+
+=cut
+
+op rx_is_s(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ RxAssertMore(rx, $2);
+
+ if(rx_is_whitespace_character(RxCurChar(rx))) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else {
+ goto OFFSET($2);
+ }
+
+}
+
+########################################
+
+=item C<rx_oneof>(p, sc, ic)
+
+Matches if the current character is one of the characters in the second parameter.
+Sensitive to the C<i> modifier.
+
+This op requires that its input be sorted for efficiency. Further, it requires that
+all
+ranges (C<a-z>) be expanded by the regex compiler.
+
+B<XXX> Currently does not honor the C<i> modifier.
+
+=cut
+
+op rx_oneof(in pmc, in str, in int) {
+ RX_dUNPACK($1);
+ STRING *ch1;
+ STRING *ch2;
+ INTVAL i;
+
+ /* XXX In the future, this ought to use bitmaps. */
+
+ RxAssertMore(rx, $3);
+
+ ch1=RxCurCharS(rx);
+
+ if(string_length($2) < 8) { /* XXX run benchmarks to find a good value */
+ /* modified linear search--slow, but zero overhead */
+ for(i=0; i < string_length($2); i++) {
+ ch2=rxP_get_substr(interpreter, $2, i, 1);
+
+ if(string_compare(interpreter, ch1, ch2)==0) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else if(string_compare(interpreter, ch1, ch2) < 0) {
+ goto OFFSET($3);
+ }
+ }
+ }
+ else {
+ /* binary search--fast but complicated */
+ INTVAL upper, lower=0, index=0, lastindex=-1, cmp;
+
+ upper=string_length($2);
+
+ while(upper > lower) {
+ index=(upper+lower)/2;
+
+ if(index==lastindex) {
+ goto OFFSET($3);
+ }
+ else if(index==string_length($2)) {
+ goto OFFSET($3);
+ }
+
+ cmp=string_compare(interpreter, RxCurCharS(rx),
+rxP_get_substr(interpreter, $2, index, 1));
+
+ if(0==cmp) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else if(0 > cmp) {
+ upper=index;
+ }
+ else {
+ lower=index;
+ }
+
+ lastindex=index;
+ }
+ }
+
+ goto OFFSET($3);
+}
+
+########################################
+
+=item C<rx_dot>(p, ic)
+
+Matches any character except a newline (C<\n>). (If the C<s> modifier is used,
+matches any character at all.)
+
+=cut
+
+op rx_dot(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ RxAssertMore(rx, $2);
+
+ if(RxSingleLine_test(rx)) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else {
+ STRING *ch=RxCurCharS(rx);
+ STRING *nl=string_make(interpreter, "\n", 1, 0, 0, 0);
+
+ if(string_compare(interpreter, ch, nl)!=0) {
+ RxAdvance(rx);
+ goto NEXT();
+ }
+ else {
+ goto OFFSET($2);
+ }
+ }
+}
+
+########################################
+
+=item C<rx_zwa_boundary>(p, ic)
+
+Matches if the one of the previous character and the next character is a word
+character, and the other one is not (usually C<\b>).
+
+=cut
+
+op rx_zwa_boundary(in pmc, in int) {
+ RX_dUNPACK($1);
+ char ch1, ch2;
+
+ ch1=RxCurChar(rx);
+ RxAdvanceX(rx, -1);
+ ch2=RxCurChar(rx);
+ RxAdvance(rx);
+
+ if(rx_is_word_character(ch1) == rx_is_word_character(ch2)) {
+ goto OFFSET($2);
+ }
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_zwa_atbeginning>(p, ic)
+
+Matches at the beginning of the string. If the C<m> modifier is used, matches at the
+beginning of any line.
+
+B<XXX> Currently does not honor the C<m> modifier.
+
+=cut
+
+op rx_zwa_atbeginning(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ if(rx->index != 0) {
+ goto OFFSET($2);
+ }
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_zwa_atend>(p, ic)
+
+Matches at the end of the string. If the C<m> modifier is used, matches at the
+end of any line.
+
+B<XXX> Currently does not honor the C<m> modifier.
+
+=cut
+
+op rx_zwa_atend(in pmc, in int) {
+ RX_dUNPACK($1);
+
+ if(rx->index != string_length(rx->string)) {
+ goto OFFSET($2);
+ }
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_succeed>(p)
+
+Modifies the info structure to indicate that the match succeeded.
+
+=cut
+
+op rx_succeed(in pmc) {
+ RX_dUNPACK($1);
+
+ rx->success=1;
+
+ goto NEXT();
+}
+
+########################################
+
+=item C<rx_fail>(p)
+
+Modifies the info structure to indicate that the match failed.
+
+=cut
+
+op rx_fail(in pmc) {
+ RX_dUNPACK($1);
+
+ rx->success=0;
+
+ goto NEXT();
+}
+
+###############################################################################
+
+=back
+
+=head2 Using the opcodes
+
+=head3 Tutorial
+
+Now that you've seen all the opcodes available, you'll probably want to know how to
+use
+them.
+
+To do so, we'll walk you through the building of a regular expression. For this
+example,
+we'll use the expression C</ab*[cd]+/i>. (This is the same expression written out in
+L</SYNOPSIS>.)
+
+The first step is to break it up into term-quantifier pairs. In this case:
+
+ RX_0:
+ a
+ b*
+ [cd]+
+
+
+Next, we'll figure out how to match each term.
+
+ RX_0:
+ rx_literal P0, "a", ...
+ rx_literal P0, "b", ...
+ rx_oneof P0, "cd", ...
+
+The elipses will be filled in later.
+
+Now, we need to figure out how to represent the quantifiers. We end up with
+something
+like this:
+
+ RX_0:
+ rx_literal P0, "a", ...
+
+ $top1:
+ rx_literal P0, "b", ...
+ branch $top1
+
+ rx_oneof P0, "cd", ...
+ $top2:
+ rx_oneof P0, "cd", ...
+ branch $top2
+
+Note that C<[cd]+> is equivalent to C<[cd][cd]*>. We take advantage of this fact.
+
+Next, we add C<rx_pushmark> ops at the boundaries between quantifiers, and
+C<rx_pushindex> ops within the quantifiers themselves.
+
+ RX_0:
+ rx_literal P0, "a", ...
+
+ rx_pushmark P0
+ $top1:
+ rx_literal P0, "b", ...
+ rx_pushindex P0
+ branch $top1
+
+ rx_pushmark P0
+ $top2:
+ rx_oneof P0, "cd", ...
+ rx_pushindex P0
+ branch $top2
+
+Next, we add the backtracking code. Backtracking is usually done one way.
+
+ RX_0:
+ rx_literal P0, "a", ...
+ $back0:
+ branch ...
+
+ rx_pushmark P0
+ $top1:
+ rx_literal P0, "b", ...
+ rx_pushindex P0
+ branch $top1
+ $back1:
+ rx_popindex P0, ...
+
+ rx_pushmark P0
+ $top2:
+ rx_oneof P0, "cd", ...
+ rx_pushindex P0
+ branch $top2
+ $back2:
+ rx_popindex P0, ...
+
+After that, we'll add some skeleton code.
+
+ RX_0:
+ rx_setprops P0, "i", 2
+ branch $start0
+ $advance:
+ rx_advance P0, ...
+
+ $start0:
+ rx_literal P0, "a", ...
+ $back0:
+ branch ...
+ #if there's no quantifier, you just fall back to the
+ # previous backtrack
+
+ rx_pushmark P0
+ $top1:
+ rx_literal P0, "b", ...
+ rx_pushindex P0
+ branch $top1
+ $back1:
+ rx_popindex P0, ...
+
+ rx_pushmark P0
+ rx_oneof P0, "cd", ...
+ $top2:
+ rx_oneof P0, "cd", ...
+ rx_pushindex P0
+ branch $top2
+ $back2:
+ rx_popindex P0, ...
+
+ rx_succeed P0
+ ret
+ $fail:
+ rx_fail P0
+ ret
+
+Now that that's done, we'll connect the dots (literally).
+
+ RX_0:
+ rx_setprops P0, "i", 2
+ branch $start0
+ $advance:
+ rx_advance P0, $fail
+ $start0:
+ rx_literal P0, "a", $advance
+ $back0:
+ branch $advance
+
+ $start1:
+ rx_pushmark P0
+ $top1:
+ #when you're looping in a quantifier and a match operation fails,
+ # you should simply move on to the next step of the match.
+ rx_literal P0, "b", $start2
+ rx_pushindex P0
+ branch $top1
+ $back1:
+ rx_popindex P0, $back0
+
+ $start2:
+ rx_literal P0, "cd", $back1
+ rx_pushmark P0
+ $top2:
+ rx_oneof P0, "cd", $succeed
+ rx_pushindex P0
+ branch $top2
+ $back2:
+ rx_popindex P0, $back1
+
+ $succeed:
+ rx_succeed P0
+ ret
+ $fail:
+ rx_fail P0
+ ret
+
+Notice how C<$advance> serves as a sort of C<$back-1>--it serves as the default
+"backtracker". Note also how only the C<rx_advance> uses the C<$fail> label--that's
+how
+it backtracks.
+
+The final step is optimization. In this case, there are two things we can optimize.
+First of all, that C<$back0> is pointless--we're better off just deleting it and
+branching
+to C<$advance> directly. Second, C<$back2> will I<never> be called, so we can get
+rid of
+it too.
+
+ RX_0:
+ rx_setprops P0, "i", 2
+ branch $start0
+ $advance:
+ rx_advance P0, $fail
+ $start0:
+ rx_literal P0, "a", $advance
+
+ $start1:
+ rx_pushmark P0
+ $top1:
+ rx_literal P0, "b", $start2
+ rx_pushindex P0
+ branch $top1
+ $back1:
+ rx_popindex P0, $advance
+
+ $start2:
+ rx_literal P0, "cd", $back1
+ $top2:
+ rx_oneof P0, "cd", $succeed
+ branch $top2
+
+ $succeed:
+ rx_succeed P0
+ ret
+ $fail:
+ rx_fail P0
+ ret
+
+We've now written the regular expression itself; the one thing left to do is write
+the
+code that calls it. Let's say the Perl code looks like this:
+
+ unless("zzabbBBBBBBcdcdcdzz" =~ /ab*[cd]+/i) {
+ print "no ";
+ }
+
+ print "match";
+
+Then the Parrot code would be something like this:
+
+ rx_allocateinfo P0, "zzabbBBBBBBcdcdcdzz"
+ bsr RX_0
+ rx_info_successful P0, I0
+ rx_freeinfo P0
+ if I0, $match
+ print "no "
+ $match:
+ print "match"
+ end
+
+Congratulations--you've now written your first regular expression with Rx3. That
+wasn't
+so hard, now was it?
+
+=head3 Common constructs
+
+The list below gives simple templates for common quantifiers operations.
+
+=over 4
+
+=item C<x*>
+
+ $start:
+ rx_pushmark P0
+ $loop:
+ rx_pushindex P0
+ rx_literal P0, "x", $next
+ branch $loop
+ $back:
+ rx_popindex P0, $lastback
+ branch $next
+
+=item C<x+>
+
+ $start:
+ rx_literal P0, "x", $lastback
+ rx_pushmark P0
+ $loop:
+ rx_pushindex P0
+ rx_literal P0, $next
+ branch $loop
+ $back:
+ rx_popindex P0, $lastback
+ branch $next
+
+=item C<x?>
+
+ $start:
+ rx_pushmark P0
+ rx_literal P0, "x", $next
+ rx_pushindex P0
+ branch $next
+ $back:
+ rx_popindex P0, $lastback
+ branch $next
+
+=item C<x*?>
+
+ $start:
+ branch $next
+ $back:
+ rx_literal P0, "x", $lastback
+ branch $next
+
+=item C<x+?>
+
+ $start:
+ rx_literal P0, "x", $lastback
+ branch $next
+ $back:
+ rx_literal P0, "x", $lastback
+ branch $next
+
+=item C<x??>
+
+ $start:
+ set I0, 0 #I0 used to make sure we haven't backtracked before
+ branch $next
+ $back:
+ if I0, $lastback
+ rx_literal P0, "x", $lastback
+ branch $next
+
+=item C<x|y>
+
+ $start:
+ rx_pushmark P0
+ rx_pushindex P0
+ rx_literal P0, "x", $nextalt
+ branch $next
+ $nextalt:
+ rx_popindex P0
+ rx_literal P0, "x", $back #no, that's not a typo
+ branch $next
+ $back:
+ rx_popmark P0
+ branch $lastback
+
+=item C<(?=x)>
+
+ set P1, P0
+ rx_cloneinfo P0
+
+ rx_literal P0, "x", $lastback
+
+ set P0, P1
+
+=back
+
+B<XXX> Finish this documentation.
+
+=head1 BUGS
+
+=over 4
+
+=item *
+
+This code currently requires everything to be in an eight-bit encoding compatible
+with ASCII.
+
+=item *
+
+Many modifiers are not currently respected.
+
+=item *
+
+There are undoubtably many more in code this complicated.
+
+=back
+
+=head1 AUTHORS
+
+Copyright (C) 2001-2002 The Parrot Team <[EMAIL PROTECTED]>.
+
+Initial version by Brent Dax <[EMAIL PROTECTED]>; special thanks to Angel
+Faus <[EMAIL PROTECTED]> and Jeff 'japhy' Pinyan <[EMAIL PROTECTED]> for major help,
+especially with decisions on the architecture of the engine.
+
+=cut
\ No newline at end of file
--- /dev/null Wed Jan 9 03:01:06 2002
+++ /parrot/t/op/rx.t Mon Jan 7 22:31:14 2002
@@ -0,0 +1,203 @@
+use Parrot::Test tests => 20;
+
+sub gentest($$;$$) {
+ $_[2] ||= "";
+ $_[3] ||= 0;
+
+ return <<"END";
+ set S0, "$_[0]"
+ rx_allocinfo P0, S0
+ bsr RX_0
+ rx_info_successful P0, I0
+ if I0, \$yup
+ print "no match\\n"
+ end
+ \$yup:
+ rx_info_getstartindex P0, I1
+ rx_info_getindex P0, I2
+ length I3, S0
+
+ rx_freeinfo P0
+
+ substr S1, S0, 0, I1
+ sub I4, I2, I1
+ substr S2, S0, I1, I4
+ sub I4, I3, I2
+ substr S3, S0, I2, I4
+
+ print "<"
+ print S1
+ print "><"
+ print S2
+ print "><"
+ print S3
+ print ">\\n"
+
+ end
+
+ RX_0:
+ rx_setprops P0, "$_[2]", $_[3]
+ branch \$start
+ \$advance:
+ rx_advance P0, \$fail
+ \$start:
+ $_[1]
+
+ rx_succeed P0
+ ret
+ \$fail:
+ rx_fail P0
+ ret
+END
+}
+
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'A is A');
+ rx_literal P0, "a", $advance
+CODE
+<><a><>
+OUTPUT
+
+output_is(gentest('b', <<'CODE'), <<'OUTPUT', 'A is not B');
+ rx_literal P0, "a", $advance
+CODE
+no match
+OUTPUT
+
+output_is(gentest('ba', <<'CODE'), <<'OUTPUT', 'inching through the string');
+ rx_literal P0, "a", $advance
+CODE
+<b><a><>
+OUTPUT
+
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'character classes (successful)');
+ rx_oneof P0, "aeiou", $advance
+CODE
+<><a><>
+OUTPUT
+
+output_is(gentest('b', <<'CODE'), <<'OUTPUT', 'character classes (failure)');
+ rx_oneof P0, "aeiou", $advance
+CODE
+no match
+OUTPUT
+
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'dot (success)');
+ rx_dot P0, $advance
+CODE
+<><a><>
+OUTPUT
+
+output_is(gentest('\n', <<'CODE'), <<'OUTPUT', 'dot (failure)');
+ rx_dot P0, $advance
+CODE
+no match
+OUTPUT
+
+output_is(gentest('aA9_', <<'CODE'), <<'OUTPUT', '\w (success)');
+ rx_is_w P0, $advance
+ rx_is_w P0, $advance
+ rx_is_w P0, $advance
+ rx_is_w P0, $advance
+CODE
+<><aA9_><>
+OUTPUT
+
+output_is(gentest('?', <<'CODE'), <<'OUTPUT', '\w (failure)');
+ rx_is_w P0, $advance
+CODE
+no match
+OUTPUT
+
+output_is(gentest('ba', <<'CODE', 'r'), <<'OUTPUT', 'reversed regexen (/r)');
+ rx_dot P0, $advance
+CODE
+<b><a><>
+OUTPUT
+
+output_is(gentest('\n', <<'CODE', 's'), <<'OUTPUT', 'single-line regexen (/s)');
+ rx_dot P0, $advance
+CODE
+<><
+><>
+OUTPUT
+
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'stack (pushindex/popindex)');
+ rx_pushindex P0
+ rx_literal P0, "a", $advance
+ rx_popindex P0, $advance
+CODE
+<><><a>
+OUTPUT
+
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'stack (pushmark)');
+ rx_pushmark P0
+ rx_pushindex P0
+ rx_literal P0, "a", $advance
+ rx_popindex P0, $advance
+ rx_popindex P0, $advance
+CODE
+no match
+OUTPUT
+
+TODO: {
+ local $TODO="pending key fixes" if $^O eq "MSWin32";
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'groups');
+ rx_startgroup P0, 0
+ rx_literal P0, "a", $advance
+ rx_endgroup P0, 0
+
+ rx_info_getgroup P0, I1, I2, 0
+ sub I2, I2, I1
+ substr S1, S0, I1, I2
+ print "("
+ print S1
+ print ")\n"
+CODE
+(a)
+<><a><>
+OUTPUT
+}
+
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'ZWA: ^ (success)');
+ rx_zwa_atbeginning P0, $advance
+ rx_literal P0, "a", $advance
+CODE
+<><a><>
+OUTPUT
+
+output_is(gentest('b', <<'CODE'), <<'OUTPUT', 'ZWA: ^ (failure)');
+ rx_zwa_atbeginning P0, $advance
+ rx_literal P0, "a", $advance
+CODE
+no match
+OUTPUT
+
+output_is(gentest('a', <<'CODE'), <<'OUTPUT', 'ZWA: $ (success)');
+ rx_literal P0, "a", $advance
+ rx_zwa_atend P0, $advance
+CODE
+<><a><>
+OUTPUT
+
+output_is(gentest('ab', <<'CODE'), <<'OUTPUT', 'ZWA: $ (failure)');
+ rx_literal P0, "a", $advance
+ rx_zwa_atend P0, $advance
+CODE
+no match
+OUTPUT
+
+output_is(gentest('a?', <<'CODE'), <<'OUTPUT', 'ZWA: \b (success)');
+ rx_literal P0, "a", $advance
+ rx_zwa_boundary P0, $advance
+CODE
+<><a><?>
+OUTPUT
+
+output_is(gentest('ab', <<'CODE'), <<'OUTPUT', 'ZWA: \b (failure)');
+ rx_literal P0, "a", $advance
+ rx_zwa_boundary P0, $advance
+CODE
+no match
+OUTPUT
+
+1;
\ No newline at end of file
--- /parrot-cvs/string.c Tue Jan 8 20:34:54 2002
+++ /parrot/string.c Tue Jan 8 22:07:40 2002
@@ -300,9 +300,15 @@
UINTVAL true_length;
true_offset = (UINTVAL)offset;
+
+ if (idx == string_length(src) && length < 1) {
+ return NULL;
+ }
+
if (offset < 0) {
true_offset = (UINTVAL) (src->strlen + offset);
}
+
if (true_offset > src->strlen-1) { /* 0 based... */
INTERNAL_EXCEPTION(SUBSTR_OUT_OF_STRING,
"Cannot take substr outside string")