On Wed, 30 May 2001 17:14:15 +0100, Matthew Robinson wrote:
> RFC822 will allow all of the following (taken from CGI Programming with
> Perl) and was designed to accept all the addresses in use in 1982:
> 
> Alfred Neuman <Neuman@BBN-TENEXA>
> ":sysmail"@ Some-Group. Some-Org
> Muhammed.(I am the greatest) Ali @(the)vegas.WBA

Attached is the address parser from my mail client (which I might eventually release). 
It returns an arrayref of hashrefs, containing:
  addr => The actual address (minus comments)
  comment => All the comments
  text => The whole text of the address
  name => The name

If I parse q(Alfred Neuman <Neuman@BBN-TENEXA>, ":sysmail"@ Some-Group. Some-Org, 
Muhammed.(I am the greatest) Ali @(the)vegas.WBA) with it, I get this back:

$VAR1 = [
          {
            'text' => 'Alfred Neuman <Neuman@BBN-TENEXA>',
            'comment' => undef,
            'addr' => 'Neuman@BBN-TENEXA',
            'name' => 'Alfred Neuman'
          },
          {
            'text' => ' ":sysmail"@ Some-Group. Some-Org ',
            'comment' => undef,
            'addr' => ':[EMAIL PROTECTED]',
            'name' => 'Alfred Neuman'
          },
          {
            'text' => 'Muhammed.(I am the greatest) Ali @(the)vegas.WBA',
            'comment' => ' (I am the greatest) (the)',
            'addr' => '[EMAIL PROTECTED]',
            'name' => 'Alfred Neuman'
          }
        ];

Oooh, look! It's broken! Oh well, back to the drawing board.

-- 
        Peter Haworth   [EMAIL PROTECTED]
``Shall we have perl yell if the string "Matt Wright"
  is found in a comment when running under -w too?''
                -- Dan Sugalski
# $Revision: 1.8 $

%token tComma tColon tSemi
%token tAngLeft tAngRight
%token tAt tDot
%token tAtom tQuotedString tQuotedPair

%%

addresses:
  address
    { [ $_[1] ] }
| addresses tComma address
    { [ @{$_[1]},$_[3] ] }
;

address:
  address_
    { 
      $_[0]->ParseComments;
      my $data=$_[0]->YYData;

      my $addr={
        addr => $_[1],
        comment => $data->{COMMENT},
        text => $data->{TEXT},
        name => $data->{NAME},
      };
      delete $data->{COMMENT};
      delete $data->{TEXT};
      $addr->{name}=~s/^\s+//s;
      $addr;
    }
;

address_:
  group
| mailbox
;

group:
  phrase tColon mailboxes tSemi
;

mailboxes:
  mailbox
| mailboxes tComma mailbox
;

mailbox:
  addr_spec
| opt_phrase route_addr
    { $_[0]->YYData->{NAME}.=" $_[1]"; $_[2] }
;

addr_spec:
  local_part tAt domain
    { "$_[1]$_[2]$_[3]" }
;

opt_phrase:
| phrase
;

phrase:
  word
| phrase word
    { "$_[1] $_[2]" }
;

route_addr:
  tAngLeft opt_route addr_spec tAngRight
    { $_[3] } # XXX Ignore route for now
;

opt_route:
  routes tColon
|
;

routes:
  routes tAt domain
| tAt domain
;

local_part:
  local_part tDot word
    { "$_[1]$_[2]$_[3]" }
| word
;

domain:
  domain tDot sub_domain
    { "$_[1]$_[2]$_[3]" }
| sub_domain
;

sub_domain:
  domain_ref
/* | domain_literal */
;

domain_ref:
  tAtom
;

word:
  tAtom
| tQuotedString
;


%%

my %tokens=reverse(
  tComma => ',',
  tColon => ':',
  tSemi => ';',
  tAngLeft => '<',
  tAngRight => '>',
  tParLeft => '(',
  tParRight => ')',
  tBraLeft => '[',
  tBraRight => ']',
  tAt => '@',
  tDot => '.',
);
my $tokens=join '',keys %tokens;

# Remove whitespace and comments
# This is done outside the lexer, since we call it before the first token
sub ParseComments{
  my($parser)=@_;
  my $data=$parser->YYData;

  for($data->{INPUT}){
    while(s/^(\s+)// || /^\(/){
      $data->{TEXT}.=$1;
      if(s/^\(//){
        my $level=1;
        my $ctext='(';
        while($level){
          s/^([^()\\]+)//
            and $ctext.=$1;
          s/^((?:\\.)+)//
            and $ctext.=$1;
          s/^\(//
            and $ctext.='(' and ++$level;
          if(s/^\)//){
            $ctext.=')';
            last unless --$level;
          }
        }
        $data->{COMMENT}.=" $ctext";
        $data->{TEXT}.=$ctext;
      }
    }
  }
}

# Debugging version
sub __Lexer{
  my($parser)=@_;
  my @ret=&_Lexer;

  local $"=',';
  warn "Lex returned: (@ret)\n";
  @ret;
}

sub _Lexer{
  my($parser)=@_;
  my $data=$parser->YYData;

  # Remove whitespace and comments
  $parser->ParseComments;

  # Determine next token
  for($data->{INPUT}){
    return ('',undef) if $_ eq '';

    if(s/^([\Q$tokens\E])//o){
      $data->{TEXT}.=$1 unless $1 eq ',';
      return ($tokens{$1},$1);
    }
    if(s/^"//){
      my $str;
      while(1){
        if(s/^"//){
          $data->{TEXT}.=qq("$str");
          return (tQuotedString => $str);
        }elsif(s/^\\(.)//s){
          $str.=$1;
        }elsif(s/^([^\\"]+)//){
          $str.=$1;
        }else{
          $data->{TEXT}.=qq("$str");
          return (tQuotedString => $str);
        }
      }
    }
    if(s/^\\(.)//s){
      $data->{TEXT}.="\\$1";
      return (tQuotedPair => $1);
    }
    if(s/^([^\s\000-\037()<>\@,;\\".\[\]]+)//){
      $data->{TEXT}.=$1;
      return (tAtom => $1);
    }
  }
  if(s/^(.)//s){
    $data->{TEXT}.=$1;
    return (tUnknown => $1);
  }
}

sub _Error{
  my($self)=@_;

  # XXX We're ignoring errors for now


#  warn "Invalid message id in ".$self->YYData->{ALLINPUT}."\n";
}

sub Run{
  my($self,$data)=@_;
  my $dref=$self->YYData;
  %$dref=();
  $dref->{ALLINPUT}=$dref->{INPUT}=$data;

  $self->YYParse(
    yylex => \&_Lexer,
    yyerror => \&_Error,
#    yydebug => 0x1f,
  );
}



Reply via email to