Re: Programming Email Filters

Peter Haworth Tue, 02 Sep 2003 06:37:23 -0700

On Mon, 1 Sep 2003 14:09:58 -0700, Dave Cross wrote:
> The first problem is identifying why the email ended up in my inbox. I
> need to work out which of the many email addresses in the many headers is
> aimed at me. Here's the algorithm I'm using.
>
> 1/ If there's an 'Envelope-to' header then use that and stop looking.


Convince your local MDA to always insert one (such as qmail's
Delivered-to:). Then you only need this step. The other message headers are
all lies, anyway.

I'm in the fortunate position of being the only user on my mail server, so I
can reject messages which aren't addressed to me at SMTP RCPT time, and I
don't even have to receive them in the first place. Of course, I still get
hundreds of bounces addressed to me, due to the current virus/spam
addressing trend, but this did cut down the flood quite considerably.

> I'm using the Email::* modules but there doesn't seem to be a way to
> extract the actual deliverable email address from the headers.
>
> Should I just write Email::Address and submit it to the Email::* project?

I've written a yapp grammar for my MUA, which (from memory) parses
everything in RFC822 addresses (not necessarily 2822 - it didn't exist)
except for domain literals. While it successfully identifies addresses
correctly, constructing sensible values for comments really only works for
comments which were sensible to start with:

  "Dave Cross" <[EMAIL PROTECTED]>
    [EMAIL PROTECTED]
    name=Dave Cross
    comment=
  Muhammed.(I am  the greatest) Ali @(the)Vegas.WBA
    [EMAIL PROTECTED]
    name=
    comment= (I am the greatest) (the)

At the risk of annoying the list admins, I've attached it to this message,
since I still haven't got round to putting the MUA on CPAN.

-- 
        Peter Haworth   [EMAIL PROTECTED]
"The usability of a computer language is inversely proportional to the
 number of theoretical axes the language designer tries to grind."
        -- Larry Wall

# $Revision: 1.9 $

%token tComma tColon tSemi
%token tAngLeft tAngRight
%token tAt tDot
%token tAtom tQuotedString tQuotedPair

%%

addresses:
  address
    { [ $_[1] ] }
| addresses tComma address
    { [ @{$_[1]},$_[3] ] }
;

address:
  address_
    { 
      $_[0]->ParseComments;
      my $data=$_[0]->YYData;

      my $addr={
        addr => $_[1],
        comment => $data->{COMMENT},
        text => $data->{TEXT},
        name => $data->{NAME},
      };
      delete $data->{COMMENT};
      delete $data->{TEXT};
      delete $data->{NAME};
      $addr->{name}=~s/^\s+//s;
      $addr;
    }
;

address_:
  group
| mailbox
;

group:
  phrase tColon mailboxes tSemi
;

mailboxes:
  mailbox
| mailboxes tComma mailbox
;

mailbox:
  addr_spec
| opt_phrase route_addr
    { $_[0]->YYData->{NAME}.=" $_[1]"; $_[2] }
;

addr_spec:
  local_part tAt domain
    { "$_[1]$_[2]$_[3]" }
;

opt_phrase:
| phrase
;

phrase:
  word
| phrase word
    { "$_[1] $_[2]" }
;

route_addr:
  tAngLeft opt_route addr_spec tAngRight
    { $_[3] } # XXX Ignore route for now
;

opt_route:
  routes tColon
|
;

routes:
  routes tAt domain
| tAt domain
;

local_part:
  local_part tDot word
    { "$_[1]$_[2]$_[3]" }
| word
;

domain:
  domain tDot sub_domain
    { "$_[1]$_[2]$_[3]" }
| sub_domain
;

sub_domain:
  domain_ref
/* | domain_literal */
;

domain_ref:
  tAtom
;

word:
  tAtom
| tQuotedString
;


%%

my %tokens=reverse(
  tComma => ',',
  tColon => ':',
  tSemi => ';',
  tAngLeft => '<',
  tAngRight => '>',
  tParLeft => '(',
  tParRight => ')',
  tBraLeft => '[',
  tBraRight => ']',
  tAt => '@',
  tDot => '.',
);
my $tokens=join '',keys %tokens;

# Remove whitespace and comments
# This is done outside the lexer, since we call it before the first token
sub ParseComments{
  my($parser)[EMAIL PROTECTED];
  my $data=$parser->YYData;

  for($data->{INPUT}){
    while(s/^(\s+)// || /^\(/){
      $data->{TEXT}.=$1;
      if(s/^\(//){
        my $level=1;
        my $ctext='(';
        while($level){
          s/^([^()\\]+)//
            and $ctext.=$1;
          s/^((?:\\.)+)//
            and $ctext.=$1;
          s/^\(//
            and $ctext.='(' and ++$level;
          if(s/^\)//){
            $ctext.=')';
            last unless --$level;
          }
        }
        $data->{COMMENT}.=" $ctext";
        $data->{TEXT}.=$ctext;
      }
    }
  }
}

# Debugging version
sub __Lexer{
  my($parser)[EMAIL PROTECTED];
  my @ret=&_Lexer;

  local $"=',';
  warn "Lex returned: (@ret)\n";
  @ret;
}

sub _Lexer{
  my($parser)[EMAIL PROTECTED];
  my $data=$parser->YYData;

  # Remove whitespace and comments
  $parser->ParseComments;

  # Determine next token
  for($data->{INPUT}){
    return ('',undef) if $_ eq '';

    if(s/^([\Q$tokens\E])//o){
      $data->{TEXT}.=$1 unless $1 eq ',';
      return ($tokens{$1},$1);
    }
    if(s/^"//){
      my $str;
      while(1){
        if(s/^"//){
          $data->{TEXT}.=qq("$str");
          return (tQuotedString => $str);
        }elsif(s/^\\(.)//s){
          $str.=$1;
        }elsif(s/^([^\\"]+)//){
          $str.=$1;
        }else{
          $data->{TEXT}.=qq("$str");
          return (tQuotedString => $str);
        }
      }
    }
    if(s/^\\(.)//s){
      $data->{TEXT}.="\\$1";
      return (tQuotedPair => $1);
    }
    if(s/^([^\s\000-\037()<>\@,;\\".\[\]]+)//){
      $data->{TEXT}.=$1;
      return (tAtom => $1);
    }
  }
  if(s/^(.)//s){
    $data->{TEXT}.=$1;
    return (tUnknown => $1);
  }
}

sub _Error{
  my($self)[EMAIL PROTECTED];

  # XXX We're ignoring errors for now


#  warn "Invalid message id in ".$self->YYData->{ALLINPUT}."\n";
}

sub Run{
  my($self,$data)[EMAIL PROTECTED];
  my $dref=$self->YYData;
  %$dref=();
  $dref->{ALLINPUT}=$dref->{INPUT}=$data;

  $self->YYParse(
    yylex => \&_Lexer,
    yyerror => \&_Error,
#    yydebug => 0x1f,
  );
}

Re: Programming Email Filters

Reply via email to