p6ge Makefile README demo.pir p6ge.pir p6ge_gen.c p6ge_parse.c p6ge_parsep5.c

Patrick R . Michaud Thu, 18 Nov 2004 14:35:06 -0800

cvsuser     04/11/18 14:34:52


  Added:       compilers/p6ge Makefile README demo.pir p6ge.pir p6ge_gen.c
                        p6ge_parse.c p6ge_parsep5.c
  Log:
  First official/public release of p6ge.
  
  Revision  Changes    Path
  1.1                  parrot/compilers/p6ge/Makefile
  
  Index: Makefile
  ===================================================================
  PARROT = ../..
  CFLAGS = -I $(PARROT)/include
  p6ge.so: p6ge_parse.o p6ge_gen.o p6ge_parsep5.o
        $(CC) -shared -fpic p6ge_parse.o p6ge_gen.o p6ge_parsep5.o -o p6ge.so
  
  p6ge_gen.o: p6ge_gen.c p6ge.h
  p6ge_parse.o: p6ge_parse.c p6ge.h
  p6ge_parsep5.o: p6ge_parsep5.c p6ge.h
  
  
  
  1.1                  parrot/compilers/p6ge/README
  
  Index: README
  ===================================================================
  =head1 Perl 6 Grammar Engine (P6GE)
  
  This is an initial implementation of a Perl 6 Grammar Engine designed
  to run in Parrot.  It's definitely a work in progress, and much of the
  current implementation is designed simply to "bootstrap" us along
  (i.e., some parts such as the parser and generator are expected to be
  discarded).  The current work is also largely incomplete -- although
  it has support for groups (capturing and non-capturing), quantifiers, 
  alterations, etc., many of the standard assertions and character classes 
  are not implemented yet but will be coming soon.
  
  In addition there's some experimental work here in being able to parse 
  and convert Perl *5* regular expressions into PIR subroutines, but the
  focus should be on building the perl6 capabilities first and we'll
  fold in perl 5 syntax a bit later.
  
  =head1 Installation
  
  P6GE assumes that it is part of the parrot distribution in the 
  F<compilers/p6ge> directory.   Simply type C<make> in this directory
  to build the F<p6ge.so> shared library it needs.  You can
  either leave this file in the current directory (and set LD_LIBRARY_PATH
  or equivalent appropriately), or move it to F<runtime/parrot/dynext>.
  (XXX: Need to update Parrot configure so this happens automatically?)
  
  The distribution comes with a small F<demo.pir> program that gives an
  example of using P6GE.  To run the demo, simply do 
  C<parrot demo.pir>.  In the demo, you can use a leading slash to enter
  a rule to be compiled (note: no trailing slash), you can enter a 
  string to be matched against the previously entered rule, you can enter
  a "?" to see the PIR subroutine for the rule, and you can enter a "+"
  to re-invoke the rule on the previous string.
  
  If you get errors about "cannot open shared object file", that usually
  means that Parrot was unable to locate the p6ge.so object file.
  
  =head1 Using P6GE
  
  Because I couldn't find a lot of detailed information about writing a
  Parrot compiler in C, I went ahead and wrote this using Parrot's
  native call interface.  Essentially the library provides a 
  C<p6ge_p6rule_pir()> function that builds a PIR code string from a 
  given rule string, this PIR code string can then be sent to the PIR 
  compiler to get a final subroutine.  (Having the PIR code string 
  easily available and printable is also very handy for debugging as 
  we build the rest of the grammar engine.)
  
  Here's a short example for compiling a regular expression and then
  matching it against another string:
  
      load_bytecode "p6ge.pir"
      .local pmc p6ge_compile
      p6ge_compile = global "_p6ge_compile"  # get the compiler
  
      .local string pattern       
      .local pmc rulesub                     
      set pattern, "^(From|Subject):"        # pattern to compile
      rulesub = p6ge_compile(pattern)        # compile it to rulesub
  
      .local pmc match
      $S0 = "From: [EMAIL PROTECTED]"       # target string
      match = rulesub($S0)                   # execute rule on target string
  
    match_loop:
      unless match goto match_fail           # if match fails stop
      print "match succeeded\n"
  
      match."_print"()                       # display captures ($0, $1, etc.)
  
      match."_next"()                        # find the next match
      goto match_loop
  
    match_fail:
      print "match failed\n"            
  
  One can also get the intermediate PIR code that P6GE generates for
  the rule subroutine -- just use
  
      (rule, $S0) = p6ge_compile(target)
  
  and you can print/inspect the contents of $S0 to see the generated code.
  
  =head1 Known Limitations
  
  At the moment, P6GE is not very aware of Unicode strings -- it's pretty 
  much limited to 8-bit Latin-1 characters in patterns.  This is because the
  P6GE parser is still in C, and because support for character class 
  operations in Parrot is still being worked out.  Eventually these
  things will be resolved and we'll fix P6GE up for Unicode then.
  
  Most of the backslash sequences (\d, \D, \N, etc.) are unimplemented, 
  although backslashes do properly escape metacharacters in literals.
  
  P6GE doesn't (yet) properly handle nested repetitions of zero-length 
  patterns in groups -- that's coming next.
  
  This is just the first-cut framework for building the 
  remainder of the engine, so many items (assertions, lookaround, 
  conjunctions, closures, character classes, hypotheticals, 
  backreferences, etc.) just aren't implemented yet.  They're
  on their way!
  
  Also, many well-known optimizations (e.g., Boyer-Moore) aren't 
  implemented yet -- my primary goals at this point are to
  "release early, often" and to get sufficient features in place so
  that more people can be testing and building upon the engine.
  
  Lastly, error handling needs to be improved, but this will likely
  be decided as we discover how P6GE integrates with the rest of
  Parrot.
  
  =head1 Implementation notes
  
  Basically, P6GE consists of a parser and a PIR code generator written
  in C.  The parser takes a string representing a P6 rule and builds a
  parse tree for the rule.  The generator then produces a PIR subroutine
  (matching engine) that can match strings according to the components 
  of the rule.
  
  The matching engine uses bsr/ret for its internal subroutine calls 
  (also optimized for tailcalls) and then uses Parrot calling 
  conventions for all interfaces with external callers/callees.  
  The compiler is designed such that we can switch the matching engine 
  to use PCC for its internal calls at some point in the future.
  
  P6GE also uses Parrot coroutines for the matching
  engine, so that after a successful match is found, the 
  next match within the same string can be found by simply 
  returning control to the matching coroutine (which then
  picks up from where it had previously left off until
  another match is discovered).
  
  =cut
  
  
  
  1.1                  parrot/compilers/p6ge/demo.pir
  
  Index: demo.pir
  ===================================================================
  .sub _main
      .local string x
      .local string pattern
      .local string patpir
      .local pmc rule
      .local pmc stdin
      .local pmc match
      .local pmc p6ge_compile
  
      load_bytecode "p6ge.pir"
      p6ge_compile = global "_p6ge_compile"
      
  
    read_loop:
      print "input /pattern, string to match, + to continue match,"
      print " ? to print pir,\n"
      getstdin stdin
      readline x, stdin
      length $I0, x 
      if $I0 < 1 goto end_loop
      $S0 = substr x, 0, 1                   # determine command type
      if $S0 == "/" goto load_pattern        # /pattern
      if $S0 == "?" goto print_pir           # ? == print pir
      if $S0 == "+" goto match_again         # + == repeat last match
      chopn x,1
      match = rule(x)                        # perform a match on x
      goto match_result
    match_again:
      match."_next"()
    match_result:
      unless match goto match_fail
      print "match succeeded:\n"
      match."_print"()
      goto read_loop
    match_fail:
      print "match failed\n"
      goto read_loop
  
    load_pattern:
      $S0 = substr x, -1, 1
      if $S0 != '/' goto sans_slash
      chopn x,1
    sans_slash:
      pattern = substr x, 1                  # get the pattern
      (rule, patpir) = p6ge_compile(pattern) # compile to PIR code
      goto read_loop
  
    print_pir:
      print "pattern = /"
      print pattern
      print "/\nPIR subroutine =\n"
      print patpir
      goto read_loop
    
    end_loop:
  .end
  
  
  
  1.1                  parrot/compilers/p6ge/p6ge.pir
  
  Index: p6ge.pir
  ===================================================================
  =head1 TITLE
  
  p6ge.pir - main module for the Perl 6 Grammar Engine
  
  =head1 SYNOPSIS
  
      ...
      .local pmc p6gec
      loadbytecode "p6ge.pir"
      p6gec = global "_p6ge_compile"
      ...
      rule = p6gec("^(From|To):")
      match = rule($S0)
      ...
  
  =head1 DESCRIPTION
  
  This module contains the routines and constants necessary to
  use the Perl 6 Grammar Engine.
  
  =head1 FUNCTIONS
  
  =over 4
  
  =cut
  
  .sub __onload @LOAD
      newclass $P0, "P6GEMatch"
      addattribute $P0, ".target"            # string to be matched
      addattribute $P0, ".rulecor"           # match coroutine
      addattribute $P0, ".state"             # result of the match
      addattribute $P0, ".rephash"           # repeats hash (key=groupid)
      addattribute $P0, ".caphash"           # captures hash (key=groupid)
  .end
  
  =item sub = _p6ge_compile( pattern )
  
  =item (sub, pir) = _p6ge_compile( pattern )
  
  Compiles a string containing a pattern into a subroutine that can
  be used to match strings containing the pattern.  The first form
  simply returns a subroutine; the second form also returns a string
  containing the intermediate PIR code that was generated to produce
  the subroutine.
  
  =cut
  
  .sub _p6ge_compile
      .param string pattern
      .local pmc p6gec
      .local string pir
      .local pmc rulesub
      loadlib $P0, "p6ge"                    # load p6ge.so
      dlfunc p6gec, $P0, "p6ge_p6rule_pir", "tt"  # find the p6ge compiler
      
      pir = p6gec(pattern)                   # compile to PIR
      compreg $P0, "PIR"                     # get the PIR compiler
      rulesub = compile $P0, pir             # compile rule's PIR to a sub
      .return(rulesub, pir)               
  .end
  
  
  .namespace [ "P6GEMatch" ]
  
  .sub _init method
      .param string target
      .param pmc rulecor
      .local pmc state, rephash, caphash
      $P0 = new .PerlString                  # set .target
      $P0 = target
      classoffset $I0, self, "P6GEMatch"       
      setattribute self, $I0, $P0              
      inc $I0                                # set .rulecor
      setattribute self, $I0, rulecor               
      state = new .PerlInt                   # set .state attribute
      state = -1
      inc $I0
      setattribute self, $I0, state         
      rephash = new .PerlHash                # set .rep hash
      inc $I0
      setattribute self, $I0, rephash
      caphash = new .PerlHash                # set .capture hash
      inc $I0
      setattribute self, $I0, caphash
  .end
  
  .sub __get_bool method
      classoffset $I0, self, "P6GEMatch"
      $I0 += 2
      getattribute $P0, self, $I0            # ".status"
      $I1 = $P0
      .return($I1)
  .end
  
  .sub _next method
      .local pmc target
      .local pmc rulecor
      .local pmc status
      .local string target_s
      .local int target_len
      .local int pos
      classoffset $I0, self, "P6GEMatch"
      getattribute target, self, $I0         # ".target" string to match
      inc $I0
      getattribute rulecor, self, $I0        # ".rulecor" coroutine
      inc $I0
      getattribute status, self, $I0         # ".status"
      if status == 0 goto next_end           # already failed, so fail
      target_s = target
      target_len = length target_s
      pos = 0
      (pos) = rulecor(self, target_s, pos, target_len) 
      if pos < 0 goto next_fail
      status = 1
      goto next_end
    next_fail:
      status = 0
    next_end:
      .return(pos)
  .end
  
  .sub _print method
      .local pmc target
      .local pmc caphash
      .local pmc capiter
      .local pmc caparray
      .local pmc groupiter
      .local string gname
      classoffset $I0, self, "P6GEMatch"
      getattribute target, self, $I0         # ".target" string
      $I0 += 4
      getattribute caphash, self, $I0        # ".capture" hash
  
      new groupiter, .Iterator, caphash
      set groupiter, 0                       # XXX: .ITERATE_FROM_START == 0
    group_loop:
      unless groupiter goto key_end
      shift gname, groupiter
      print "  $"
      print gname
      print ":"
      caparray = caphash[gname]
      new capiter, .Iterator, caparray
      set capiter, 0
    cap_loop:
      unless capiter goto cap_end
      shift $I1, capiter
      shift $I2, capiter
      $I3 = $I2 - $I1
      substr $S1, target, $I1, $I3
      print " <"
      print $S1
      print " @ "
      print $I1
      print ">"
      goto cap_loop
    cap_end:
      print "\n"
      goto group_loop
    key_end:
  .end
  
  =back
  
  =head1 AUTHOR
  
  Patrick Michaud ([EMAIL PROTECTED]) is the author and maintainer.
  Patches and suggestions should be sent to the Perl 6 compiler list.
  
  =head1 COPYRIGHT
  
  Copyright (c) 2004, The Perl Foundation.
  
  =cut
  
  
  
  1.1                  parrot/compilers/p6ge/p6ge_gen.c
  
  Index: p6ge_gen.c
  ===================================================================
  /*
  
  =head1 NAME
  
  p6ge/p6ge_gen.c - Generate PIR code from a P6 rule expression
  
  =head1 DESCRIPTION
  
  This file contains the functions designed to convert a P6 rule
  expression (usually generated by p6ge_parse() ) into the PIR code
  that can execute the rule on a string.
  
  =head2 Functions
  
  =over 4
  
  =cut
  
  */
  
  #include "p6ge.h"
  #include "parrot/parrot.h"
  #include <malloc.h>
  #include <stdarg.h>
  
  static char* p6ge_cbuf = 0;
  static int p6ge_cbuf_size = 0;
  static int p6ge_cbuf_len = 0;
  static int p6ge_cbuf_lcount = 0;
  
  static void p6ge_gen_exp(P6GE_Exp* e, const char* succ);
  
  /* emit(...) writes strings to an automatically grown string buffer */
  static void
  emit(const char* fmt, ...)
  {
      int lookahead;
      va_list ap;
      lookahead = p6ge_cbuf_len + P6GE_MAX_LITERAL_LEN * 2 + 3 + strlen(fmt) * 
2;
      if (lookahead > p6ge_cbuf_size) {
          while (lookahead > p6ge_cbuf_size) p6ge_cbuf_size += 4096;
          p6ge_cbuf = realloc(p6ge_cbuf, p6ge_cbuf_size);
      }
      va_start(ap, fmt);
      p6ge_cbuf_len += vsprintf(p6ge_cbuf + p6ge_cbuf_len, fmt, ap);
      va_end(ap);
  }
  
  
  static void
  emitlcount()
  {
      char* s;
      int lcount = 0;
  
      for(s = p6ge_cbuf; *s; s++) { if (*s == '\n') lcount++; }
      if (lcount > p6ge_cbuf_lcount + 10) {
          emit("# line %d\n", lcount);
          p6ge_cbuf_lcount = lcount;
      }
  }
          
  
  static void
  emitsub(const char* sub, ...)
  {
      char* s[10];
      int i;
      va_list ap;
  
      va_start(ap, sub);
      for(i = 0; i < 10; i++) {
          s[i] = va_arg(ap, char*);
          if (!s[i]) break;
          emit("    save %s\n", s[i]);
      }
      emit("    bsr %s\n", sub);
      while (i > 0) emit("    restore %s\n", s[--i]);
  }
  
  
  /* strcon(...) converts string values into PIR string constants */
  static char*
  strcon(const char* s, int len)
  {
      static char esc[P6GE_MAX_LITERAL_LEN * 2 + 3];
      char* t = esc;
      int i;
      *(t++) = '"';
      for(i = 0; i < len; i++) {
          switch (s[i]) {
              case '\\': *(t++) = '\\'; *(t++) = '\\'; break;
              case '"' : *(t++) = '\\'; *(t++) = '"'; break;
              case '\n': *(t++) = '\\'; *(t++) = 'n'; break;
              case '\r': *(t++) = '\\'; *(t++) = 'r'; break;
              case '\t': *(t++) = '\\'; *(t++) = 't'; break;
              case '\0': *(t++) = '\\'; *(t++) = '0'; break;
              default  : *(t++) = s[i]; break;
          }
      }
      *(t++) = '"';
      *t = 0;
      return esc;
  }
     
   
  static void
  p6ge_gen_pattern_end(P6GE_Exp* e, const char* succ)
  {
      emit("R%d:                               # end of pattern\n", e->id);
      emit("    saveall\n");
      emit("    .yield(pos)\n");
      emit("    restoreall\n");
      emit("    goto fail\n");
  }
  
  
  static void
  p6ge_gen_dot(P6GE_Exp* e, const char* succ)
  {
      emit("R%d:                               # dot {%d..%d}%c\n", 
           e->id, e->min, e->max, (e->isgreedy) ? ' ' : '?');
      emit("    maxrep = length target\n");
      emit("    maxrep -= pos\n");
      if (e->max != P6GE_INF) {
          emit("    if maxrep <= %d goto R%d_1\n", e->max, e->id);
          emit("    maxrep = %d\n", e->max);
          emit("  R%d_1:\n", e->id);
      }
      if (e->isgreedy) {
          emit("    rep = maxrep\n");
          emit("    pos += rep\n");
          emit("  R%d_2:\n", e->id);
          emit("    if rep == %d goto %s\n", e->min, succ);
          emit("    if rep < %d goto fail\n", e->min);
          emitsub(succ, "pos", "rep", 0);
          emit("    dec rep\n");
          emit("    dec pos\n");
          emit("    goto R%d_2\n\n", e->id);
      }
      else { /* dot lazy */
          emit("    rep = %d\n", e->min);
          if (e->min > 0) emit("    pos += %d\n", e->min);
          emit("  R%d_3:\n", e->id);
          emit("    if rep == maxrep goto %s\n", succ);
          emit("    if rep > maxrep goto fail\n");
          emitsub(succ, "pos", "rep", "maxrep", 0);
          emit("    inc rep\n");
          emit("    inc pos\n");
          emit("    goto R%d_3\n\n", e->id);
      }
  }
  
  
  static void
  p6ge_gen_literal(P6GE_Exp* e, const char* succ)
  {
      emit("R%d:                               # %.16s {%d..%d}%c\n", 
           e->id, strcon(e->name, e->nlen), e->min, e->max, 
           (e->isgreedy) ? ' ' : '?');
  
      if (e->min==1 && e->max==1) {
          emit("    substr $S0, target, pos, %d\n", e->nlen);
          emit("    if $S0 != %s goto fail\n", strcon(e->name, e->nlen));
          emit("    pos += %d\n", e->nlen);
          emit("    goto %s\n\n", succ);
          return;
      }
  
      if (e->isgreedy) {
          emit("    rep = 0\n");
          emit("  R%d_1:\n", e->id);
          if (e->max != P6GE_INF)
              emit("    if rep >= %d goto R%d_2\n", e->max, e->id);
          emit("    substr $S0, target, pos, %d\n", e->nlen);
          emit("    if $S0 != %s goto R%d_2\n", strcon(e->name, e->nlen), 
e->id);
          emit("    inc rep\n");
          emit("    pos += %d\n", e->nlen);
          emit("    goto R%d_1\n", e->id);
          emit("  R%d_2:\n", e->id);
          emit("    if rep == %d goto %s\n", e->min, succ);
          if (e->min > 0)
              emit("    if rep < %d goto fail\n", e->min);
          emitsub(succ, "pos", "rep", 0);
          emit("    dec rep\n");
          emit("    pos -= %d\n", e->nlen);
          emit("    goto R%d_2\n\n", e->id);
          return;
      } 
      else { /* islazy */
          emit("    rep = 0\n");
          emit("  R%d_1:\n", e->id);
          if (e->max != P6GE_INF) 
              emit("    if rep == %d goto %s\n", e->max, succ);
          if (e->min > 0)
              emit("    if rep < %d goto R%d_2:\n", e->min, e->id);
          emitsub(succ, "pos", "rep", 0);
          emit("  R%d_2:\n", e->id);
          emit("    substr $S0, target, pos, %d\n", e->nlen);
          emit("    if $S0 != %s goto fail\n", strcon(e->name, e->nlen));
          emit("    inc rep\n");
          emit("    pos += %d\n", e->nlen);
          emit("    goto R%d_1\n\n", e->id);
          return;
      } 
  }
  
  
  static void
  p6ge_gen_concat(P6GE_Exp* e, const char* succ)
  {
      char succ2[20];
      
      emit("R%d:                               # concat R%d, R%d\n", 
           e->id, e->exp1->id, e->exp2->id);
      sprintf(succ2,"R%d",e->exp2->id);
      p6ge_gen_exp(e->exp1, succ2);
      p6ge_gen_exp(e->exp2, succ);
  }
  
  
  /* XXX: add some docs that describes how this works! */
  /* XXX: add check to prevent infinite recursion on zero-length match */
  static void
  p6ge_gen_group(P6GE_Exp* e, const char* succ)
  {
      char repsub[32];
      char r1sub[32];
      char key[32];
      char c1, c2;
  
      c1 = '['; c2 = ']';
      if (e->group >= 0) { c1 = '('; c2 = ')'; }
      sprintf(repsub, "R%d_repeat", e->id);
      sprintf(r1sub, "R%d", e->exp1->id);
      sprintf(key,"\"%d\"", e->group);
  
      emit("R%d:                               # group %s %c R%d %c 
{%d..%d}%c\n",
           e->id, key, c1, e->exp1->id, c2, 
           e->min, e->max, (e->isgreedy) ? ' ' : '?');
      /* for unquantified, non-capturing groups, don't bother with the
         group code */
      if (e->min==1 && e->max==1 && e->group<0) {
          p6ge_gen_exp(e->exp1, succ);
          return;
      }
  
      /* otherwise, we have work to do */
  
      /* set the repeat hash */
      emit("    classoffset $I0, match, \"P6GEMatch\"\n");
      emit("    $I0 += 3\n");
      emit("    getattribute gr_rep, match, $I0\n");
      emit("    $I1 = exists gr_rep[%s]\n", key);
      emit("    if $I1 goto R%d_1\n", e->id);
      emit("    new $P1, .PerlInt\n");
      emit("    gr_rep[%s] = $P1\n", key);
      emit("  R%d_1:\n", e->id);
  
      if (e->group >= 0) { 
          emit("    inc $I0\n");
          emit("    getattribute gr_cap, match, $I0\n");
          emit("    $I1 = exists gr_cap[%s]\n", key);
          emit("    if $I1 goto R%d_2\n", e->id);
          emit("    new $P1, .PerlArray\n");
          emit("    gr_cap[%s] = $P1\n", key);
          emit("  R%d_2:\n", e->id);
      }
  
      emit("    $P1 = gr_rep[%s]\n", key);
      emitsub(repsub, "pos", "gr_rep", "$P1", 0);
      emit("    gr_rep[%s] = $P1\n", key);
      emit("    goto fail\n\n");
  
      emit("%s:\n", repsub);
      emit("    classoffset $I0, match, \"P6GEMatch\"\n");
      emit("    $I0 += 3\n");
      emit("    getattribute $P0, match, $I0\n");
      emit("    gr_rep = $P0[%s]\n", key);
      if (e->group >= 0) { 
          emit("    inc $I0\n");
          emit("    getattribute $P0, match, $I0\n");
          emit("    gr_cap = $P0[%s]\n", key);
          emit("    if gr_rep < 1 goto %s_1\n", repsub);  /* save prev cap end 
*/
          emit("    push gr_cap, pos\n");
      }
  
      emit("  %s_1:\n", repsub);
      if (e->isgreedy) {
          if (e->max != P6GE_INF) 
              emit("    if gr_rep >= %d goto %s_2\n", e->max, repsub);
          emit("    inc gr_rep\n");
          if (e->group >= 0)
              emit("    push gr_cap, pos\n");         /* save next cap start */
          emitsub(r1sub, "pos", "gr_cap", "gr_rep", 0);
          if (e->group >= 0)
              emit("    $I0 = pop gr_cap\n");        /* remove next cap start */
          emit("    dec gr_rep\n");
          emit("  %s_2:\n", repsub);
          if (e->min > 0) 
              emit("    if gr_rep < %d goto %s_fail\n", e->min, repsub);
          emitsub(succ, "pos", "gr_cap", "gr_rep", 0);
      } 
      else { /* group lazy */
          if (e->min > 0)
              emit("    if gr_rep < %d goto %s_3\n", e->min, repsub);
          emitsub(succ, "pos", "gr_cap", "gr_rep", 0);
          emit("  %s_3:\n", repsub);
          if (e->max != P6GE_INF)
              emit("    if gr_rep >= %d goto %s_fail\n", e->max, repsub);
          emit("    inc gr_rep\n");
          if (e->group >= 0) 
              emit("    push gr_cap, pos\n");         /* save next cap start */
          emitsub(r1sub, "pos", "gr_cap", "gr_rep", 0);
          if (e->group >= 0)
              emit("    $I0 = pop gr_cap\n");        /* remove next cap start */
          emit("    dec gr_rep\n");
      }  /* group lazy */
  
      emit("  %s_fail:\n", repsub);
      if (e->group >= 0) {
          emit("    if gr_rep < 1 goto fail\n", repsub);  
          emit("    $I0 = pop gr_cap\n");             /* remove prev cap end */
      }
      emit("    goto fail\n\n");
  
      p6ge_gen_exp(e->exp1, repsub);
  }
  
  
  static void
  p6ge_gen_alt(P6GE_Exp* e, const char* succ)
  {
      char r1sub[32];
  
      sprintf(r1sub,"R%d", e->exp1->id);
      emit("R%d:                               # alt R%d | R%d\n", 
           e->id, e->exp1->id, e->exp2->id);
      emitsub(r1sub, "pos", 0);
      emit("    goto R%d\n\n", e->exp2->id);
  
      p6ge_gen_exp(e->exp1, succ);
      p6ge_gen_exp(e->exp2, succ);
  }
      
  
  static void
  p6ge_gen_anchor(P6GE_Exp* e, const char* succ)
  {
  
      switch(e->type) {
      case P6GE_ANCHOR_BOS:
          emit("R%d:                               # ^anchor\n", e->id);
          emit("    if pos != 0 goto fail\n");
          emit("    goto %s\n", succ);
          return;
      case P6GE_ANCHOR_EOS:
          emit("R%d:                               # anchor$\n", e->id);
          emit("    if pos != lastpos goto fail\n");
          emit("    goto %s\n", succ);
          return;
      case P6GE_ANCHOR_BOL:
          emit("R%d:                               # ^^anchor\n", e->id);
          emit("    if pos == 0 goto %s\n", succ);
          emit("    if pos == lastpos goto fail\n");
          emit("    $I0 = lastpos - 1\n");
          emit("    substr $S0, target, $I0, 1\n");
          emit("    if $S0 == \"\\n\" goto %s\n", succ);
          emit("    goto fail\n\n");
          return;
      case P6GE_ANCHOR_EOL:
          emit("R%d:                               # anchor$$\n", e->id);
          emit("    if pos == lastpos goto R%d_1\n", e->id);
          emit("    substr $S0, target, pos, 1\n");
          emit("    if $S0 == \"\\n\" goto %s\n", succ);
          emit("    goto fail\n");
          emit("R%d_1:\n", e->id);
          emit("    $I0 = pos - 1\n");
          emit("    substr $S0, target, $I0, 1\n");
          emit("    if $S0 != \"\\n\" goto %s\n", succ);
          emit("    goto fail\n\n");
          return;
      }
  }
  
  
  static void 
  p6ge_gen_exp(P6GE_Exp* e, const char* succ)
  {
      emitlcount();
      switch (e->type) {
      case P6GE_PATTERN_END: p6ge_gen_pattern_end(e, succ); break;
      case P6GE_DOT: p6ge_gen_dot(e, succ); break;
      case P6GE_LITERAL: p6ge_gen_literal(e, succ); break;
      case P6GE_CONCAT: p6ge_gen_concat(e, succ); break;
      case P6GE_GROUP: p6ge_gen_group(e, succ); break;
      case P6GE_ALT: p6ge_gen_alt(e, succ); break;
      case P6GE_ANCHOR_BOS:
      case P6GE_ANCHOR_EOS: 
      case P6GE_ANCHOR_BOL:
      case P6GE_ANCHOR_EOL: p6ge_gen_anchor(e, succ); break;
      }
  }
  
  
  static char* 
  p6ge_gen(P6GE_Exp* e)
  {
      char r1sub[32];
      p6ge_cbuf_len = 0;
      p6ge_cbuf_lcount = 0;
  
      emit(".sub _P6GE_Rule\n");
      emit("    .param string target\n");
      emit("    .local pmc match\n");
      emit("    .local pmc rulecor\n");
      emit("  class_loaded:\n");
      emit("    find_type $I0, \"P6GEMatch\"\n");
      emit("    new match, $I0\n");
      emit("    newsub rulecor, .Coroutine, _Rule_cor\n");
      emit("    match.\"_init\"(target, rulecor)\n");
      emit("    match.\"_next\"()\n");
      emit("    .return(match)\n");
      emit(".end\n\n");
  
      emit(".sub _Rule_cor\n");
      emit("    .param pmc match\n");
      emit("    .param string target\n");
      emit("    .param int pos\n");
      emit("    .param int lastpos\n");
      emit("    .local int rep\n");
      emit("    .local int maxrep\n");
      emit("    .local pmc gr_rep\n");
      emit("    .local pmc gr_cap\n");
      sprintf(r1sub, "R%d", e->id);
      emitsub(r1sub, 0);
      emit("  fail_forever:\n");
      emit("    .yield(-1)\n");
      emit("    goto fail_forever\n\n");
  
      p6ge_gen_exp(e, 0);
      emit("  fail:\n    pos = -1\n    ret\n");
      emit(".end\n");
  
      return p6ge_cbuf;
  }
  
  /*
  
  =item C<char* p6ge_p6rule_pir(const char* s)>
  
  Converts the rule expression in s to its equivalent PIR code.
  This function calls p6ge_parse() to build an expression tree from
  the string in s, then calls p6ge_gen() to generate a PIR subroutine
  from the expression tree.
  
  =cut
  
  */
  
  char*
  p6ge_p6rule_pir(const char* s)
  {
      P6GE_Exp* e = 0;
      P6GE_Exp* dot0 = 0;
      P6GE_Exp* group0 = 0;
      P6GE_Exp* end = 0;
      char* pir = 0;
  
      dot0 = p6ge_parse_new(P6GE_DOT, 0, 0);
      dot0->min = 0; dot0->max = P6GE_INF; dot0->isgreedy = 0;
  
      group0 = p6ge_parse_new(P6GE_GROUP, p6ge_parse(s), 0);
  
      end = p6ge_parse_new(P6GE_PATTERN_END, 0, 0);
  
      e = p6ge_parse_new(P6GE_CONCAT, dot0,
                         p6ge_parse_new(P6GE_CONCAT, group0, end));
  
      pir = p6ge_gen(e);
      p6ge_parse_free(e);
      return pir;
  }
  
  
  /*
  
  =item C<char* p6ge_p5rule_pir(const char* s)>
  
  Converts the P5 rule expression in s to its equivalent PIR code.
  This function calls p5re_parse() to build an expression tree from
  the string in s, then calls p6ge_gen() to generate a PIR subroutine
  from the expression tree.
  
  =cut
  
  */
  
  char*
  p6ge_p5rule_pir(const char* s)
  {
      P6GE_Exp* e = 0;
      P6GE_Exp* dot0 = 0;
      P6GE_Exp* group0 = 0;
      P6GE_Exp* end = 0;
      char* pir = 0;
  
      dot0 = p6ge_parse_new(P6GE_DOT, 0, 0);
      dot0->min = 0; dot0->max = P6GE_INF; dot0->isgreedy = 0;
  
      group0 = p6ge_parse_new(P6GE_GROUP, p6ge_parsep5(s), 0);
  
      end = p6ge_parse_new(P6GE_PATTERN_END, 0, 0);
  
      e = p6ge_parse_new(P6GE_CONCAT, dot0,
                         p6ge_parse_new(P6GE_CONCAT, group0, end));
  
      pir = p6ge_gen(e);
      p6ge_parse_free(e);
      return pir;
  }
  
  
  /*
  
  =item C<void Parrot_lib_p6ge_init(Parrot_Interp interpreter, PMC* lib)>
  
  Used when this module is loaded dynamically by Parrot's loadlib
  instruction -- automatically initializes the p6ge engine.
  
  =cut
  
  */
  
  void 
  Parrot_lib_p6ge_init(Parrot_Interp interpreter, PMC* lib)
  {
      p6ge_init();
  }
  
  /*
  
  =back
  
  =head1 SEE ALSO
  
  F<p6ge/p6ge.h> and F<p6ge/p6ge_parse.c>
  
  =head1 HISTORY
  
  Initial version by Patrick R. Michaud on 2004.11.16
  
  =cut
  
  */
  
  /*
   * Local variables:
   * c-indentation-style: bsd
   * c-basic-offset: 4
   * indent-tabs-mode: nil
   * End:
   *
   * vim: expandtab shiftwidth=4:
   */
  
  
  
  1.1                  parrot/compilers/p6ge/p6ge_parse.c
  
  Index: p6ge_parse.c
  ===================================================================
  /*
  
  =head1 NAME
  
  p6ge/p6ge_parse.c - Parse a P6 rule into an expression tree
  
  =head1 DESCRIPTION
  
  This file contains the functions for parsing perl 6 regular expressions
  into a tree representation that can then be used to generate regular
  expression code.
  
  =head2 Functions
  
  =over 4
  
  =cut
  
  */
  
  #include "p6ge.h"
  #include <ctype.h>
  #include <malloc.h>
  #include <stdio.h>
  
  const char* p6ge_exp_s[] = {
     "pattern end", "dot", "literal", 
     "concat", "group", "alt",
     "^anchor", "anchor$",
     "^^anchor", "anchor$$"
  };
  
  int p6ge_ctype[256];
  int p6ge_cmeta[256];
  
  static P6GE_Exp* p6ge_parse_expr(P6GE_Text* t);
  
  void
  p6ge_init()
  {
      unsigned int c;
      for(c=0; c<256; c++) {
          p6ge_ctype[c] = (isspace(c)) ? (ctmeta | ctspace) : 0;
          p6ge_cmeta[c] = (!isalnum(c)) ? c : -1;
      }
      p6ge_ctype[0] |= ctmeta | ctket;
      p6ge_ctype['['] |= ctmeta;
      p6ge_ctype['('] |= ctmeta;
      p6ge_ctype['<'] |= ctmeta;
      p6ge_ctype['{'] |= ctmeta;
      p6ge_ctype[']'] |= ctmeta | ctket;
      p6ge_ctype[')'] |= ctmeta | ctket;
      p6ge_ctype['>'] |= ctmeta | ctket;
      p6ge_ctype['}'] |= ctmeta | ctket;
      p6ge_ctype['|'] |= ctmeta | ctket;
      p6ge_ctype['+'] |= ctmeta | ctquant;
      p6ge_ctype['*'] |= ctmeta | ctquant;
      p6ge_ctype['?'] |= ctmeta | ctquant;
      p6ge_ctype['.'] |= ctmeta;
      p6ge_ctype['^'] |= ctmeta;
      p6ge_ctype['$'] |= ctmeta;
      p6ge_ctype['\\'] |= ctmeta;
      p6ge_cmeta['n'] = '\n';
      p6ge_cmeta['0'] = '\0';
  }
  
  
  static void 
  p6ge_parse_error(P6GE_Text* t, const char* msg)
  {
     printf("%s at offset %d (found '%c')\n", msg, t->pos - t->text, *(t->pos));
     t->pos = "";
  }
  
  
  static void 
  p6ge_skip(P6GE_Text* t,int skip) 
  {
      const unsigned char* s = t->pos + skip;
      while (p6ge_ctype[*s] & ctspace) s++;
      while (*s=='#') {
          s++;
          while (*s && *s!='\n') s++;
          while (p6ge_ctype[*s] & ctspace) s++;
      }
      t->pos = s;
  }
  
  /*
  
  =item C<P6GE_Exp* p6ge_parse_new(p6ge_exp_t type,P6GE_Exp* exp1, P6GE_Exp* 
exp2)>
  
  Creates and initializes a new node for an expression tree, setting the 
  type and using exp1 and exp2 as the "left" and "right" subtrees of this node.
  
  =cut
  
  */
  
  P6GE_Exp*
  p6ge_parse_new(p6ge_exp_t type,P6GE_Exp* exp1, P6GE_Exp* exp2)
  {
      static int id = 0;
      P6GE_Exp* e = 0;
      e = malloc(sizeof(*e));
      e->id = id++;
      e->type = type;
      e->nlen = 0;
      e->min = 1;
      e->max = 1;
      e->isgreedy = 1;
      e->group = 0;
      e->name = 0;
      e->exp1 = exp1;
      e->exp2 = exp2;
      return e;
  }
  
  /*
  
  =item C<void p6ge_parse_free(P6GE_Exp* e)>
  
  Frees up the resources associated with the expression tree given by e.
  
  =cut
  
  */
  
  void
  p6ge_parse_free(P6GE_Exp* e)
  {
      if (e->exp1) { p6ge_parse_free(e->exp1); e->exp1 = 0; }
      if (e->exp2) { p6ge_parse_free(e->exp2); e->exp2 = 0; }
      if (e->name) { free(e->name); e->name = 0; }
      free(e);
  }
  
    
  static P6GE_Exp*
  p6ge_parse_literal(P6GE_Text* t)
  {
      static unsigned char lit[P6GE_MAX_LITERAL_LEN];
      P6GE_Exp* e = 0;
      int len = 0;
      int c;
  
      while (len<sizeof(lit) && (c = *(t->pos))) {
          if ((p6ge_ctype[c] & ctmeta) == 0)
              { lit[len++] = c; p6ge_skip(t, 1); }
          else if (c == '\\' && p6ge_cmeta[t->pos[1]] >= 0)
              { lit[len++] = p6ge_cmeta[t->pos[1]]; p6ge_skip(t, 2); }
          else break;
      }
      if (len>0) {
          e = p6ge_parse_new(P6GE_LITERAL, 0, 0);
          e->name = malloc(len); e->nlen = len;
          memcpy(e->name, lit, len);
      }
      return e;
  }
  
  
  static P6GE_Exp*
  p6ge_parse_term(P6GE_Text* t)
  {
      P6GE_Exp* e = 0;
      int c;
      int ctype;
      int type;
  
      p6ge_skip(t, 0);
      c = *(t->pos);
      ctype = p6ge_ctype[c];
      if ((ctype & ctmeta) == 0
             || (c == '\\' && p6ge_cmeta[t->pos[1]] >= 0))
         return p6ge_parse_literal(t);
      if (c == '[') {
         p6ge_skip(t,1);
         e = p6ge_parse_new(P6GE_GROUP, p6ge_parse_expr(t), 0);
         e->group = --t->ncapture;
         if (*(t->pos) != ']') p6ge_parse_error(t, "Missing ']'");
         else p6ge_skip(t, 1);
         return e;
      }
      if (c == '(') {
         p6ge_skip(t,1);
         e = p6ge_parse_new(P6GE_GROUP, p6ge_parse_expr(t), 0);
         e->group = ++t->capture;
         if (*(t->pos) != ')') p6ge_parse_error(t, "Missing ')'");
         else p6ge_skip(t, 1);
         return e;
      }
      if (c == '.') {
         p6ge_skip(t, 1);
         return p6ge_parse_new(P6GE_DOT, 0, 0);
      }
      if (c == '^') {
         type = P6GE_ANCHOR_BOS; p6ge_skip(t, 1);
         if (*(t->pos) == '^') { type = P6GE_ANCHOR_BOL; p6ge_skip(t, 1); }
         return p6ge_parse_new(type, 0, 0);
      }
      if (c == '$') {
         type = P6GE_ANCHOR_EOS; p6ge_skip(t, 1);
         if (*(t->pos) == '$') { type = P6GE_ANCHOR_EOL; p6ge_skip(t, 1); }
         return p6ge_parse_new(type, 0, 0);
      }
      p6ge_parse_error(t, "Unrecognized character");
      return 0;
  }
  
  
  static P6GE_Exp* 
  p6ge_parse_quant(P6GE_Text* t)
  {
      P6GE_Exp* e = p6ge_parse_term(t);
      P6GE_Exp* q = e;
  
      int c = *(t->pos);
      if ((p6ge_ctype[c] & ctquant) == 0) return e;
      p6ge_skip(t, 1);
  
      /* if quantifying a literal string, the quantifier only applies
         to the last char */
      if (e->type == P6GE_LITERAL && e->nlen > 1) {
         q = p6ge_parse_new(P6GE_LITERAL, 0, 0);
         e->nlen--;
         q->name = malloc(1); q->nlen = 1;
         q->name[0] = e->name[e->nlen];
         e = p6ge_parse_new(P6GE_CONCAT, e, q);
      }
  
      if (c == '+') q->max = P6GE_INF;
      else if (c == '?') { q->min = 0; }
      else if (c == '*' && *(t->pos) != '*') { q->min = 0; q->max = P6GE_INF; }
      else if (c == '*') {
          p6ge_skip(t, 1);
          if (*(t->pos) != '{') 
              p6ge_parse_error(t, "Missing { after ** quantifier");
          p6ge_skip(t, 1);
          if (isdigit(*(t->pos))) {
              q->min = q->max = atoi(t->pos);
              while (isdigit(*(t->pos))) t->pos++;
              p6ge_skip(t, 0);
          } else p6ge_parse_error(t, "Missing min value in **{} quantifier");
          if (t->pos[0] == '.' && t->pos[1] == '.') {
              p6ge_skip(t, 2);
              if (t->pos[0] == '.') { q->max = P6GE_INF; p6ge_skip(t, 1); }
              else if (isdigit(*(t->pos))) {
                  q->max = atoi(t->pos);
                  while (isdigit(*(t->pos))) t->pos++;
                  p6ge_skip(t, 0);
              }
              else p6ge_parse_error(t, "Missing max value after '..'");
          }
          if (*(t->pos) != '}') 
              p6ge_parse_error(t, "Missing closing '}'");
          else p6ge_skip(t, 1);
      }
      if (*(t->pos) == '?') { q->isgreedy = 0; p6ge_skip(t,1); }
      return e;
  }
  
  
  static P6GE_Exp*
  p6ge_parse_concat(P6GE_Text* t)
  {
      P6GE_Exp* e = p6ge_parse_quant(t);
      if ((p6ge_ctype[*(t->pos)] & ctket) == 0) 
          e = p6ge_parse_new(P6GE_CONCAT, e, p6ge_parse_concat(t));
      return e;
  }
  
  
  static P6GE_Exp*
  p6ge_parse_alt(P6GE_Text* t)
  {
      P6GE_Exp* e = p6ge_parse_concat(t);
      if (*(t->pos) == '|') {
          p6ge_skip(t, 1);
          e = p6ge_parse_new(P6GE_ALT, e, p6ge_parse_alt(t));
      }
      return e;
  }
        
  static P6GE_Exp*
  p6ge_parse_expr(P6GE_Text* t)
  {
      return p6ge_parse_alt(t);
  }
  
  /*
  
  =item C<P6GE_Exp* p6ge_parse(const char* s)>
  
  Builds a regular expression tree from the string specified in s.
  
  =cut
  
  */
  
  P6GE_Exp*
  p6ge_parse(const char* s)
  {
      P6GE_Text t;
      P6GE_Exp* e = 0;
      t.text = s;
      t.pos = s;
      t.capture = 0;
      t.ncapture = 0;
      return p6ge_parse_expr(&t);
  }
  
  /*
  
  =item C<void p6ge_printexp(FILE* fp, P6GE_Exp* e, int depth)>
  
  Displays a regular expression tree.  This function is likely obsolete.
  
  */
  
  void
  p6ge_printexp(FILE* fp, P6GE_Exp* e, int depth)
  {
      int indent = depth * 4;
      char quant[64];
          sprintf(quant,"{%d..%d}%c", e->min, e->max,
                  (e->isgreedy) ? ' ' : '?');
          switch (e->type) {
          case P6GE_PATTERN_END:
              break;
          case P6GE_LITERAL:
              fprintf(fp, "%*sliteral \"%.*s\" %s\n", indent, "", e->nlen, 
                  e->name, quant);
              break;
          case P6GE_DOT:
              fprintf(fp, "%*sdot %s\n", indent, "", quant);
              break;
          case P6GE_CONCAT:
              p6ge_printexp(fp, e->exp1, depth);
              p6ge_printexp(fp, e->exp2, depth);
              break;
          case P6GE_GROUP:
              fprintf(fp, "%*sGROUP #%d %s\n", indent, "", e->group, quant);
              p6ge_printexp(fp, e->exp1, depth+1);
              break;
          case P6GE_ALT:
              p6ge_printexp(fp, e->exp1, depth+1);
              fprintf(fp, "%*sOR \n", indent, "");
              p6ge_printexp(fp, e->exp2, depth+1);
              break;
          case P6GE_ANCHOR_BOS:
              fprintf(fp, "%*sanchor-bos\n", indent, "");
              break;
          default:
              fprintf(fp,"Unknown expression type: %d\n",e->type);
              break;
          }
  }
  
  /*
  
  =back
  
  =head1 SEE ALSO
  
  =head1 HISTORY
  
  Initial version by Patrick R. Michaud, 2004.11.16
  
  =cut
  
  /*
   * Local variables:
   * c-indentation-style: bsd
   * c-basic-offset: 4
   * indent-tabs-mode: nil
   * End:
   *
   * vim: expandtab shiftwidth=4:
   */
  
  
  
  
  1.1                  parrot/compilers/p6ge/p6ge_parsep5.c
  
  Index: p6ge_parsep5.c
  ===================================================================
  /*
  
  =head1 NAME
  
  p6ge/p6ge_parsep5.c - Parse a P5 regex into an expression tree
  
  =head1 DESCRIPTION
  
  This file contains the functions for parsing perl 5 regular expressions
  into a tree representation that can then be used to generate regular
  expression code.
  
  =head2 Functions
  
  =over 4
  
  =cut
  
  */
  
  #include "p6ge.h"
  #include <ctype.h>
  #include <malloc.h>
  #include <stdio.h>
  
  static P6GE_Exp* p5re_parse_expr(P6GE_Text* t);
  
  static void 
  p5re_parse_error(P6GE_Text* t, const char* msg)
  {
     printf("%s at offset %d (found '%c')\n", msg, t->pos - t->text, *(t->pos));
     t->pos = "";
  }
  
  
  static void 
  p5re_skip(P6GE_Text* t,int skip) 
  {
      const unsigned char* s = t->pos + skip;
      /* while (p5re_ctype[*s] & ctspace) s++;
      while (*s=='#') {
          s++;
          while (*s && *s!='\n') s++;
          while (p5re_ctype[*s] & ctspace) s++;
      } */
      t->pos = s;
  }
  
  #define p5re_parse_new  p6ge_parse_new
  #define p5re_parse_free p6ge_parse_free
  #define p5re_ctype      p6ge_ctype
  #define p5re_cmeta      p6ge_cmeta
  
  static P6GE_Exp*
  p5re_parse_literal(P6GE_Text* t)
  {
      static unsigned char lit[P6GE_MAX_LITERAL_LEN];
      P6GE_Exp* e = 0;
      int len = 0;
      int c;
  
      while (len<sizeof(lit) && (c = *(t->pos))) {
          if ((p5re_ctype[c] & ctmeta) == 0)
              { lit[len++] = c; p5re_skip(t, 1); }
          else if (p5re_ctype[c] & ctspace) 
              { lit[len++] = c; p5re_skip(t, 1); }
          else if (c == '\\' && p5re_cmeta[t->pos[1]] >= 0)
              { lit[len++] = p5re_cmeta[t->pos[1]]; p5re_skip(t, 2); }
          else break;
      }
      if (len>0) {
          e = p5re_parse_new(P6GE_LITERAL, 0, 0);
          e->name = malloc(len); e->nlen = len;
          memcpy(e->name, lit, len);
      }
      return e;
  }
  
  
  static P6GE_Exp*
  p5re_parse_term(P6GE_Text* t)
  {
      P6GE_Exp* e = 0;
      int c;
      int ctype;
      int tt;
  
      p5re_skip(t, 0);
      c = *(t->pos);
      ctype = p5re_ctype[c];
      if ((ctype & ctmeta) == 0 || (ctype & ctspace) != 0 
             || (c == '\\' && p5re_cmeta[t->pos[1]] >= 0))
         return p5re_parse_literal(t);
      if (c == '(') {
         p5re_skip(t,1);
         if (t->pos[0] == '?') { tt = --(t->ncapture);  p5re_skip(t,2); }
         else tt = ++(t->capture);
         e = p5re_parse_new(P6GE_GROUP, p5re_parse_expr(t), 0);
         e->group = tt;
         if (*(t->pos) != ')') p5re_parse_error(t, "Missing ')'");
         else p5re_skip(t, 1);
         return e;
      }
      if (c == '.') {
         p5re_skip(t, 1);
         return p5re_parse_new(P6GE_DOT, 0, 0);
      }
      if (c == '^') {
         p5re_skip(t, 1);
         return p5re_parse_new(P6GE_ANCHOR_BOS, 0, 0);
      }
      if (c == '$') {
         p5re_skip(t, 1);
         return p5re_parse_new(P6GE_ANCHOR_EOS, 0, 0);
      }
      p5re_parse_error(t, "Unrecognized character");
      return 0;
  }
  
  
  static P6GE_Exp* 
  p5re_parse_quant(P6GE_Text* t)
  {
      P6GE_Exp* e = p5re_parse_term(t);
      P6GE_Exp* q = e;
  
      int c = *(t->pos);
      if (c != '{' && (p5re_ctype[c] & ctquant) == 0) return e;
      p5re_skip(t, 1);
  
      /* if quantifying a literal string, the quantifier only applies
         to the last char */
      if (e->type == P6GE_LITERAL && e->nlen > 1) {
         q = p5re_parse_new(P6GE_LITERAL, 0, 0);
         e->nlen--;
         q->name = malloc(1); q->nlen = 1;
         q->name[0] = e->name[e->nlen];
         e = p5re_parse_new(P6GE_CONCAT, e, q);
      }
  
      if (c == '+') q->max = P6GE_INF;
      else if (c == '?') { q->min = 0; }
      else if (c == '*') { q->min = 0; q->max = P6GE_INF; }
      else if (c == '{') {
          if (isdigit(*(t->pos))) {
              q->min = q->max = atoi(t->pos);
              while (isdigit(*(t->pos))) t->pos++;
              p5re_skip(t, 0);
          } else p5re_parse_error(t, "Missing min value in {} quantifier");
          if (t->pos[0] == ',') {
              p5re_skip(t, 1);
              if (isdigit(*(t->pos))) {
                  q->max = atoi(t->pos);
                  while (isdigit(*(t->pos))) t->pos++;
                  p5re_skip(t, 0);
              }
              else q->max = P6GE_INF;
          }
          if (*(t->pos) != '}') 
              p5re_parse_error(t, "Missing closing '}'");
          else p5re_skip(t, 1);
      }
      if (*(t->pos) == '?') { q->isgreedy = 0; p5re_skip(t,1); }
      return e;
  }
  
  
  static P6GE_Exp*
  p5re_parse_concat(P6GE_Text* t)
  {
      P6GE_Exp* e = p5re_parse_quant(t);
      if ((p5re_ctype[*(t->pos)] & ctket) == 0) 
          e = p5re_parse_new(P6GE_CONCAT, e, p5re_parse_concat(t));
      return e;
  }
  
  
  static P6GE_Exp*
  p5re_parse_alt(P6GE_Text* t)
  {
      P6GE_Exp* e = p5re_parse_concat(t);
      if (*(t->pos) == '|') {
          p5re_skip(t, 1);
          e = p5re_parse_new(P6GE_ALT, e, p5re_parse_alt(t));
      }
      return e;
  }
        
  static P6GE_Exp*
  p5re_parse_expr(P6GE_Text* t)
  {
      return p5re_parse_alt(t);
  }
  
  /*
  
  =item C<P6GE_Exp* p5re_parse(const char* s)>
  
  Builds a regular expression tree from the string specified in s.
  
  =cut
  
  */
  
  P6GE_Exp*
  p6ge_parsep5(const char* s)
  {
      P6GE_Text t;
      P6GE_Exp* e = 0;
      t.text = s;
      t.pos = s;
      t.capture = 0;
      t.ncapture = 0;
      return p5re_parse_expr(&t);
  }
  
  
  /*
  
  =back
  
  =head1 SEE ALSO
  
  =head1 HISTORY
  
  Initial version by Patrick R. Michaud, 2004.11.16
  
  =cut
  
  /*
   * Local variables:
   * c-indentation-style: bsd
   * c-basic-offset: 4
   * indent-tabs-mode: nil
   * End:
   *
   * vim: expandtab shiftwidth=4:
   */

cvs commit: parrot/compilers/p6ge Makefile README demo.pir p6ge.pir p6ge_gen.c p6ge_parse.c p6ge_parsep5.c

Reply via email to