Experimental regex support

Angel Faus Thu, 01 Nov 2001 11:05:52 -0800

Hi all,

I have developed some adittions that give Parrot a limited
amount of support to regular expressions.


It all started as a little experiment to find out what the 
"compile down to low-level ops" thing could mean 
someday.

The patch consists of:

* 5 new opcodes:

   - matchexactly
   - matchanychar
   - initbrstack
   - clearbrstack
   - backtrack
   - savebr

  The first two are the ones that actually implement the 
  matches.

  initbrstack, clearbrstack, backtrack, savebr are for
  managing the stack of pending possible matches. They
  use internally the integer and destination stack.

* A perl package and script that implement a simple regex
  compiler (using YAPE::Regex by the way).

  The compiler currently outputs a parrot program that
  matches the regexp against a predefined string. It could
  be easily modified to proceduce something more useful.

Currently, the following features are supported.

* exact matches
* any char (.)
* nested groups (do not capture)
* alternation
* simple quantifires (*, + ?)

There is a lot of room for improvment, either by 
implementing features that do not require changes in 
Parrot (non-greedy-quantifiers, anchors, capturing
and most of regex options can be added right now) 
or by making the necessary changes in Parrot 
(support for locales are required for macros, etc..).

This is not a serious patch, in the sense that there 
are many things missing, the ones that are supposed 
to work are not tested enough and even the ones 
that work are implemented in a way that is just wrong.

I am a rather mediocre programmer, and this are the first 
lines of code i ever sent to a mailing list, so please be 
benevolent with me. :)

Anyway I thought it would be interesting to share my 
little experiment.

Sincerly,

-------------------
Angel Faus
[EMAIL PROTECTED]

1814a1815,1882
> ########################################
> 
> AUTO_OP matchexactly(sc, s, i, ic){
>   
>   STRING* temp;
>  
>   
>   if (string_length($2) <= $3) {
>     RETREL($4);
>     }   
> 
>   temp = string_substr(interpreter, $2, $3 , string_length($1), NULL);
> 
>   if (string_compare(interpreter, $1, temp) != 0 ) {
>     RETREL($4);
>   }
>   else {
>     $3 = $3 + string_length($1);
>   }
> }  
> 
> AUTO_OP matchanychar(s, i, ic) {
>    if (string_length($1) > $2){   
>       $2++;
>       }
>    else {
>       RETREL($3);
>    }
> }
>        
> MANUAL_OP backtrack(i){
>   opcode_t *dest;
> 
>   pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT);
>   pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, STACK_ENTRY_DESTINATION);
> 
>   RETABS(dest);
> }
> 
> 
> AUTO_OP savebr(i, ic){
>  
>   push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + cur_opcode[2],  STACK_ENTRY_DESTINATION, NULL);
> 
>   push_generic_entry(interpreter, &interpreter->user_stack_top, &($1),  STACK_ENTRY_INT, NULL);
> 
> }
> 
> AUTO_OP initbrstack(ic) {
>   INTVAL i;
>   i = -1;
>   
>   push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + cur_opcode[1], STACK_ENTRY_DESTINATION, NULL);
>   push_generic_entry(interpreter, &interpreter->user_stack_top, &i, STACK_ENTRY_INT, NULL); 
> 
> }
> 
> AUTO_OP clearbrstack(i){
>   opcode_t *dest;
>   
>   while ($1 && $1 >= 0) {
> 	pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, STACK_ENTRY_DESTINATION);
> 	pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), STACK_ENTRY_INT); 
> 	}
> 	
> }
> 
> 
1826a1895
>



package BabyRegex;

use YAPE::Regex 'BabyRegex';
use strict;
use vars '$VERSION';

$VERSION = '0.01';

my %modes = ( on => '', off => '' );

sub buildtree {
  my $self = shift;
  
  my $cnt = 0;
  my ($groupscnt, @groups);
  my @tree;
  
  while (my $node = $self->next) {            
    
    $node->id($cnt++);
    $tree[-1]->next($node) if @tree;  
    
    if ($node->type =~ /capture|group/) {
    	push @groups, $node;
    	$node->{ALTS} = [];
    	$node->{COUNT} = $groupscnt++;
    	}    	
    	
    if ($node->type eq "alt") 	 {
    	push (@{$groups[-1]->{ALTS}}, $node);
	my $groupnode = $groups[-1];
    	$node->{GROUP} = $groupnode;
	
	push @{$groupnode->{ALTS}}, $node,  
	}
    
    if ($node->type eq "close"){
    	my $groupnode = pop @groups;
    	$groupnode->{CLOSED} = $node;
    	$node->{GROUP} = $groupnode;
	for my $alt (@{$groupnode->{ALTS}}) {
	    #Alt nodes get its ID replaced by the Closing node ID, so 
	    #that the when its antecessors calls ->next->id it gets the good one.
	    #This is probably on of the worse to do that.
	    $alt->{ID} = $node->{ID};
	    }
        }
    push (@tree, $node);      
    }
    
  return @tree;  
   
}

sub cry {
  if (@_[1]) {
  	my $label = shift;
  	my $opcode = shift;
  
  	my $spc = " " x (4 - length($label) ) ;
  	print $label. ":" . $spc . $opcode . "\n";
  }
  else {
  	my $opcode = shift;
  	print "     $opcode\n";
  }

}


sub pasm {
  my ($self, $string) = @_;  
  my @tree = $self->buildtree;   
  
  cry "INIT", "initbrstack FAIL";
  cry "set I1, 0";
  cry "set S1, \"$string\"";

  for my $node (@tree) {

    $node->pasm($self);    	   	

    #print $node->type;	
  }

  
  cry "OK", "print \"match\"";
  cry "clearbrstack I1";
  cry "end";  
  print "\n";
  
  cry "FAIL", "print \"fail\"";
  cry "clearbrstack I1";
  cry "end";
  print "\n";
  
  cry "BT", "backtrack I1";
  print "\n";
}



##
## shared methods
##

sub BabyRegex::Element::id {
   my $self = shift;
   my $id = shift;
   
   if ($self->{ID}) { return $self->{ID} }
   else {
       $self->{ID} = "L" . $id;
   }

}


sub BabyRegex::Element::next {
   my $self = shift;
   my $next = shift;
   
   if ($next) {
   	$self->{NEXT} = $next;
   	return $next;
   }
   else {
   	return $self->{NEXT}
   }
   
}



sub BabyRegex::Element::cry_atomic {
  my $self = shift;
  my $opcode = shift;
  my $id = $self->id;
  
  if ($self->quant eq "*") {  	  	
  	my $nextid = $self->next()->id();
  	cry $id, "savebr I1, $nextid";
  	cry $opcode;
  	cry "branc $id";

  } elsif ($self->quant eq "+" ) {
  	my $nextid = $self->next()->id();
  	cry $id, $opcode;
  	cry "savebr I1, $nextid";
  	cry "branch $id";  	

  } elsif ($self->quant eq "?" ) {
  	my $nextid = $self->next()->id();
	cry $id, "savebr I1, $nextid";
	cry $opcode;  
  }
  else {
  	cry $id, $opcode;
  }  	
}

##
## each element pasm 
##



sub BabyRegex::anchor::pasm {
  my $self = shift;
  my $type = $self->{TEXT};
  print $type;
}


sub BabyRegex::macro::pasm { die "unimplemented\n"; }


sub BabyRegex::oct::explanation {  
  die "unimplemented - too lazy\n";
}

sub BabyRegex::hex::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::utf8hex::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::ctrl::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::named::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::Cchar::explanation {
  die "unimplemented - too lazy\n";
}


sub BabyRegex::any::pasm {
  my $self = shift;  
  my $l;
  my $id = $self->id;
    
  if ($modes{on} =~ /s/) {
  	$self->cry_atomic ("matchanychar S1, I1, BT");
  } else {
	#$self->cry_atomic ("matchanycharbutnl S1, I1, BT");
	#we don't have the opcode anyway
	$self->cry_atomic ("matchanychar S1, I1, BT");
  }

}


sub BabyRegex::text::pasm {
  my $self = shift;
  my $text = $self->text;
  
  $text =~ s/\n/\\n/g;
  $text =~ s/\r/\\r/g;
  $text =~ s/\t/\\t/g;
  $text =~ s/\f/\\f/g;
  $text =~ s/'/\\'/g;
  
  my $id = $self->id();

  $self->cry_atomic ("matchexactly \"$text\", S1, I1, BT");
  
}


sub BabyRegex::alt::pasm {
  my $self = shift;
  my $id = $self->id();
  my $endofgroup_id = $self->{GROUP}->{CLOSED}->id;
    
  cry("branch $endofgroup_id");

}


sub BabyRegex::slash::pasm {  die "unimplemented\n"; }

sub BabyRegex::class::pasm {  die "unimplemented\n"; }


sub BabyRegex::group::pasm{
  my $self = shift;
  
  my $id = $self->id;
  my $cnt = $self->{COUNT};  
  my $fs = substr($self->fullstring,1,30);
  
  print "\n";
  cry $id, "#start of n.c. group $cnt        ($fs...)";

  if ($self->quant eq "*" or $self->quant eq "?") {
  	cry "savebr I1, ". $self->{CLOSED}->next->id(); 	
  }
 
  foreach my $alt (@{$self->{ALTS}}) {
  	cry "savebr I1, " . $alt->next->id();
  }
  
}


sub BabyRegex::capture::pasm {
    
  # We are not capturing anything yet! 
  
  my $self = shift;
  my $id = $self->id;
  my $cnt = $self->{COUNT};
  my $fs = substr($self->fullstring,1,30);
  
  print "\n";
  
  if ($self->quant eq "*" or $self->quant eq "?") {
  	cry "savebr I1, ". $self->{CLOSED}->next->id(); 	
  }
  	
  cry $id, "#start of group $cnt             ($fs...)";
      
  foreach my $alt (@{$self->{ALTS}}) {
  	cry "savebr I1, ". $alt->next->id();
  }
}


sub BabyRegex::close::pasm {
  my $self = shift;  
  my $id = $self->id;
  my $cnt = $self->{GROUP}->{COUNT};

  cry $id, "#end of group $cnt";
  
  if ($self->{GROUP}->quant eq "*" or $self->{GROUP}->quant eq "+") {
  	cry "savebr I1, " .  $self->next->id();
  	cry "branch " . $self->{GROUP}->id;
  	}

  print "\n";
  
}


  
sub BabyRegex::comment::pasm { }

sub BabyRegex::whitespace::pasm{ }


sub BabyRegex::lookahead::explanation { die "unimplemented\n"; }

sub BabyRegex::lookbehind::explanation { die "unimplemented\n"; }

sub BabyRegex::code::pasm {  die "unimplemented\n"; }

sub BabyRegex::later::pasm { die "unimplemented\n"; }

sub BabyRegex::conditional::pasm { die "unimplemented\n"; }

sub BabyRegex::cut::pasm { die "unimplemented\n"; }

sub BabyRegex::flags::pasm{ die "unimplemented\n"; }

sub BabyRegex::backref::pasm { die "unimplemented \n"; }





1;

__END__

=head1 NAME

BabyRegex - compiles a regular expression down to Parrot bytecode

=head1 SYNOPSIS

  use BabyRegex;
  BabyRegex->new($REx)->pasm;

=head1 SEE ALSO

The C<YAPE::Regex> documentation.

=head1 AUTHOR

  Angel Faus
  [EMAIL PROTECTED]
  
  Based in YAPE::Regex::Explain by Jeff Pinyan ([EMAIL PROTECTED])

=cut


use BabyRegex;

unless (@ARGV[0] & @ARGV[1]) {
	print 'usage: perl babyre.pl "pattern" "string"' . "\n";
	print 'ex:    perl babyre.pl "reg(exp?|ular +expression)?" "regex" > regex.pasm' . "\n";  
	exit;
}	
	
$pattern = @ARGV[0];
$string = @ARGV[1];

$c = BabyRegex->new($pattern);
$c->pasm($string);

Experimental regex support

Reply via email to