Hi all,
I have developed some adittions that give Parrot a limited
amount of support to regular expressions.
It all started as a little experiment to find out what the
"compile down to low-level ops" thing could mean
someday.
The patch consists of:
* 5 new opcodes:
- matchexactly
- matchanychar
- initbrstack
- clearbrstack
- backtrack
- savebr
The first two are the ones that actually implement the
matches.
initbrstack, clearbrstack, backtrack, savebr are for
managing the stack of pending possible matches. They
use internally the integer and destination stack.
* A perl package and script that implement a simple regex
compiler (using YAPE::Regex by the way).
The compiler currently outputs a parrot program that
matches the regexp against a predefined string. It could
be easily modified to proceduce something more useful.
Currently, the following features are supported.
* exact matches
* any char (.)
* nested groups (do not capture)
* alternation
* simple quantifires (*, + ?)
There is a lot of room for improvment, either by
implementing features that do not require changes in
Parrot (non-greedy-quantifiers, anchors, capturing
and most of regex options can be added right now)
or by making the necessary changes in Parrot
(support for locales are required for macros, etc..).
This is not a serious patch, in the sense that there
are many things missing, the ones that are supposed
to work are not tested enough and even the ones
that work are implemented in a way that is just wrong.
I am a rather mediocre programmer, and this are the first
lines of code i ever sent to a mailing list, so please be
benevolent with me. :)
Anyway I thought it would be interesting to share my
little experiment.
Sincerly,
-------------------
Angel Faus
[EMAIL PROTECTED]
1814a1815,1882
> ########################################
>
> AUTO_OP matchexactly(sc, s, i, ic){
>
> STRING* temp;
>
>
> if (string_length($2) <= $3) {
> RETREL($4);
> }
>
> temp = string_substr(interpreter, $2, $3 , string_length($1), NULL);
>
> if (string_compare(interpreter, $1, temp) != 0 ) {
> RETREL($4);
> }
> else {
> $3 = $3 + string_length($1);
> }
> }
>
> AUTO_OP matchanychar(s, i, ic) {
> if (string_length($1) > $2){
> $2++;
> }
> else {
> RETREL($3);
> }
> }
>
> MANUAL_OP backtrack(i){
> opcode_t *dest;
>
> pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT);
> pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, STACK_ENTRY_DESTINATION);
>
> RETABS(dest);
> }
>
>
> AUTO_OP savebr(i, ic){
>
> push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + cur_opcode[2], STACK_ENTRY_DESTINATION, NULL);
>
> push_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT, NULL);
>
> }
>
> AUTO_OP initbrstack(ic) {
> INTVAL i;
> i = -1;
>
> push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + cur_opcode[1], STACK_ENTRY_DESTINATION, NULL);
> push_generic_entry(interpreter, &interpreter->user_stack_top, &i, STACK_ENTRY_INT, NULL);
>
> }
>
> AUTO_OP clearbrstack(i){
> opcode_t *dest;
>
> while ($1 && $1 >= 0) {
> pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, STACK_ENTRY_DESTINATION);
> pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT);
> }
>
> }
>
>
1826a1895
>
package BabyRegex;
use YAPE::Regex 'BabyRegex';
use strict;
use vars '$VERSION';
$VERSION = '0.01';
my %modes = ( on => '', off => '' );
sub buildtree {
my $self = shift;
my $cnt = 0;
my ($groupscnt, @groups);
my @tree;
while (my $node = $self->next) {
$node->id($cnt++);
$tree[-1]->next($node) if @tree;
if ($node->type =~ /capture|group/) {
push @groups, $node;
$node->{ALTS} = [];
$node->{COUNT} = $groupscnt++;
}
if ($node->type eq "alt") {
push (@{$groups[-1]->{ALTS}}, $node);
my $groupnode = $groups[-1];
$node->{GROUP} = $groupnode;
push @{$groupnode->{ALTS}}, $node,
}
if ($node->type eq "close"){
my $groupnode = pop @groups;
$groupnode->{CLOSED} = $node;
$node->{GROUP} = $groupnode;
for my $alt (@{$groupnode->{ALTS}}) {
#Alt nodes get its ID replaced by the Closing node ID, so
#that the when its antecessors calls ->next->id it gets the good one.
#This is probably on of the worse to do that.
$alt->{ID} = $node->{ID};
}
}
push (@tree, $node);
}
return @tree;
}
sub cry {
if (@_[1]) {
my $label = shift;
my $opcode = shift;
my $spc = " " x (4 - length($label) ) ;
print $label. ":" . $spc . $opcode . "\n";
}
else {
my $opcode = shift;
print " $opcode\n";
}
}
sub pasm {
my ($self, $string) = @_;
my @tree = $self->buildtree;
cry "INIT", "initbrstack FAIL";
cry "set I1, 0";
cry "set S1, \"$string\"";
for my $node (@tree) {
$node->pasm($self);
#print $node->type;
}
cry "OK", "print \"match\"";
cry "clearbrstack I1";
cry "end";
print "\n";
cry "FAIL", "print \"fail\"";
cry "clearbrstack I1";
cry "end";
print "\n";
cry "BT", "backtrack I1";
print "\n";
}
##
## shared methods
##
sub BabyRegex::Element::id {
my $self = shift;
my $id = shift;
if ($self->{ID}) { return $self->{ID} }
else {
$self->{ID} = "L" . $id;
}
}
sub BabyRegex::Element::next {
my $self = shift;
my $next = shift;
if ($next) {
$self->{NEXT} = $next;
return $next;
}
else {
return $self->{NEXT}
}
}
sub BabyRegex::Element::cry_atomic {
my $self = shift;
my $opcode = shift;
my $id = $self->id;
if ($self->quant eq "*") {
my $nextid = $self->next()->id();
cry $id, "savebr I1, $nextid";
cry $opcode;
cry "branc $id";
} elsif ($self->quant eq "+" ) {
my $nextid = $self->next()->id();
cry $id, $opcode;
cry "savebr I1, $nextid";
cry "branch $id";
} elsif ($self->quant eq "?" ) {
my $nextid = $self->next()->id();
cry $id, "savebr I1, $nextid";
cry $opcode;
}
else {
cry $id, $opcode;
}
}
##
## each element pasm
##
sub BabyRegex::anchor::pasm {
my $self = shift;
my $type = $self->{TEXT};
print $type;
}
sub BabyRegex::macro::pasm { die "unimplemented\n"; }
sub BabyRegex::oct::explanation {
die "unimplemented - too lazy\n";
}
sub BabyRegex::hex::explanation {
die "unimplemented - too lazy\n";
}
sub BabyRegex::utf8hex::explanation {
die "unimplemented - too lazy\n";
}
sub BabyRegex::ctrl::explanation {
die "unimplemented - too lazy\n";
}
sub BabyRegex::named::explanation {
die "unimplemented - too lazy\n";
}
sub BabyRegex::Cchar::explanation {
die "unimplemented - too lazy\n";
}
sub BabyRegex::any::pasm {
my $self = shift;
my $l;
my $id = $self->id;
if ($modes{on} =~ /s/) {
$self->cry_atomic ("matchanychar S1, I1, BT");
} else {
#$self->cry_atomic ("matchanycharbutnl S1, I1, BT");
#we don't have the opcode anyway
$self->cry_atomic ("matchanychar S1, I1, BT");
}
}
sub BabyRegex::text::pasm {
my $self = shift;
my $text = $self->text;
$text =~ s/\n/\\n/g;
$text =~ s/\r/\\r/g;
$text =~ s/\t/\\t/g;
$text =~ s/\f/\\f/g;
$text =~ s/'/\\'/g;
my $id = $self->id();
$self->cry_atomic ("matchexactly \"$text\", S1, I1, BT");
}
sub BabyRegex::alt::pasm {
my $self = shift;
my $id = $self->id();
my $endofgroup_id = $self->{GROUP}->{CLOSED}->id;
cry("branch $endofgroup_id");
}
sub BabyRegex::slash::pasm { die "unimplemented\n"; }
sub BabyRegex::class::pasm { die "unimplemented\n"; }
sub BabyRegex::group::pasm{
my $self = shift;
my $id = $self->id;
my $cnt = $self->{COUNT};
my $fs = substr($self->fullstring,1,30);
print "\n";
cry $id, "#start of n.c. group $cnt ($fs...)";
if ($self->quant eq "*" or $self->quant eq "?") {
cry "savebr I1, ". $self->{CLOSED}->next->id();
}
foreach my $alt (@{$self->{ALTS}}) {
cry "savebr I1, " . $alt->next->id();
}
}
sub BabyRegex::capture::pasm {
# We are not capturing anything yet!
my $self = shift;
my $id = $self->id;
my $cnt = $self->{COUNT};
my $fs = substr($self->fullstring,1,30);
print "\n";
if ($self->quant eq "*" or $self->quant eq "?") {
cry "savebr I1, ". $self->{CLOSED}->next->id();
}
cry $id, "#start of group $cnt ($fs...)";
foreach my $alt (@{$self->{ALTS}}) {
cry "savebr I1, ". $alt->next->id();
}
}
sub BabyRegex::close::pasm {
my $self = shift;
my $id = $self->id;
my $cnt = $self->{GROUP}->{COUNT};
cry $id, "#end of group $cnt";
if ($self->{GROUP}->quant eq "*" or $self->{GROUP}->quant eq "+") {
cry "savebr I1, " . $self->next->id();
cry "branch " . $self->{GROUP}->id;
}
print "\n";
}
sub BabyRegex::comment::pasm { }
sub BabyRegex::whitespace::pasm{ }
sub BabyRegex::lookahead::explanation { die "unimplemented\n"; }
sub BabyRegex::lookbehind::explanation { die "unimplemented\n"; }
sub BabyRegex::code::pasm { die "unimplemented\n"; }
sub BabyRegex::later::pasm { die "unimplemented\n"; }
sub BabyRegex::conditional::pasm { die "unimplemented\n"; }
sub BabyRegex::cut::pasm { die "unimplemented\n"; }
sub BabyRegex::flags::pasm{ die "unimplemented\n"; }
sub BabyRegex::backref::pasm { die "unimplemented \n"; }
1;
__END__
=head1 NAME
BabyRegex - compiles a regular expression down to Parrot bytecode
=head1 SYNOPSIS
use BabyRegex;
BabyRegex->new($REx)->pasm;
=head1 SEE ALSO
The C<YAPE::Regex> documentation.
=head1 AUTHOR
Angel Faus
[EMAIL PROTECTED]
Based in YAPE::Regex::Explain by Jeff Pinyan ([EMAIL PROTECTED])
=cut
use BabyRegex;
unless (@ARGV[0] & @ARGV[1]) {
print 'usage: perl babyre.pl "pattern" "string"' . "\n";
print 'ex: perl babyre.pl "reg(exp?|ular +expression)?" "regex" > regex.pasm' . "\n";
exit;
}
$pattern = @ARGV[0];
$string = @ARGV[1];
$c = BabyRegex->new($pattern);
$c->pasm($string);