> -----Original Message-----
> From: Theo Van Dinter [mailto:[EMAIL PROTECTED] 
> Sent: Wednesday, January 14, 2004 12:27 PM
> To: Spamassassin Devel List
> Subject: Re: 2.62 this week?
> 
> 
> On Wed, Jan 14, 2004 at 01:10:18PM -0500, Duncan Findlay wrote:
> > Is there a fix for all the bayes poison stuff in 2.62? That's my 
> > biggest issue right now with 2.61.
> 
> Not really, there's no fix for it in 2.70 either.  Invisible 
> text and html/text differences are (relatively) easy to pick 
> out, but most of the bayes poisoning stuff I've seen is just 
> random visible valid words in mail, typically at the end.
> 
> I can't think of any rule that would pick up on that as a 
> trick unless we put in full language parsing.
> 

I hit those big blobs at the end with an eval I wrote.
BODY_RUNON_60_PLUS triggers on all those recent messages that have been
sneaking through with weird subjects, mis-spelled "vagira" and blocks
of bayes poison at the bottom.

# Detect body run-ons (long runs of consecutive long words) used to poison Bayes.
#
# Each rule calls the runon_value eval with a (min, max) pair of weight bounds.
# The eval hits when the computed weight is strictly greater than min and less
# than or equal to max; when max is omitted only the lower bound is tested.
# The five rules therefore cover adjacent, non-overlapping weight buckets
# starting just above 0.1.
body BODY_RUNON_01_15           eval:runon_value('0.1','0.15')
describe BODY_RUNON_01_15       Consecutive Long Stringed Words found #1

body BODY_RUNON_15_20           eval:runon_value('0.15','0.20')
describe BODY_RUNON_15_20       Consecutive Long Stringed Words found #2

body BODY_RUNON_20_30           eval:runon_value('0.20','0.30')
describe BODY_RUNON_20_30       Consecutive Long Stringed Words found #3

body BODY_RUNON_30_60           eval:runon_value('0.30','0.60')
describe BODY_RUNON_30_60       Consecutive Long Stringed Words found #4

# No upper bound: hits for any weight above 0.60.
body BODY_RUNON_60_PLUS         eval:runon_value('0.60')
describe BODY_RUNON_60_PLUS     Consecutive Long Stringed Words found #5

# Scores below rise with the weight bucket (0.5 up to 4.5).
# NOTE(review): the "Ns/Mh" figures presumably mean spam hits / ham hits in
# the test corpus -- confirm against the tool that produced them.

############################################################
# BODY_RUNON_01_15 -- 41s/4h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_01_15 0.5

############################################################
# BODY_RUNON_15_20 -- 18s/1h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_15_20 1.5

############################################################
# BODY_RUNON_20_30 -- 12s/1h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_20_30 2.5

############################################################
# BODY_RUNON_30_60 -- 38s/2h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_30_60 3.5

############################################################
# BODY_RUNON_60_PLUS -- 94s/0h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_60_PLUS 4.5



Here is my eval:



# runon_value($self, $body, $min, $max)
#
# Eval test that weighs how strongly the body contains runs of consecutive
# long words and reports whether that weight falls in a caller-given range.
#
#   $self - object the result is cached on (key 'runon_value'); must provide
#           body_charset_is_likely_to_fp()
#   $body - array ref of body text lines
#   $min  - exclusive lower bound on the weight (defaults to 0 when undef)
#   $max  - inclusive upper bound on the weight; omit for "no upper bound"
#
# Returns true when  $min < weight <= $max  (or  $min < weight  without $max).
#
# The weight is computed once per $self and reused by every later call, so
# several rules with different ranges share one scan of the body.
sub runon_value {
  my ($self, $body, $min, $max) = @_;

  $min = 0 unless defined $min;

  unless (exists $self->{runon_value}) {
    if ($self->body_charset_is_likely_to_fp()) {
      # Cache a zero weight under OUR key so later calls skip the charset
      # check.  (This used to write to $self->{uppercase}, which clobbered an
      # unrelated cache entry and left runon_value uncached.)
      $self->{runon_value} = 0;
      return 0;
    }

    # Each entry: [ minimum word length, consecutive words required, weight ].
    # A pattern contributes its weight once if ANY body line contains such a
    # run.  Longer words and longer runs weigh more; a long run also satisfies
    # every shorter/looser pattern, so the contributions stack.
    my @patterns = (
      [5, 7, 0.02], [5, 8, 0.04], [5, 9, 0.08],
      [6, 7, 0.04], [6, 8, 0.08], [6, 9, 0.16],
      [7, 5, 0.02], [7, 6, 0.04], [7, 7, 0.08], [7, 8, 0.16], [7, 9, 0.32],
      [8, 5, 0.04], [8, 6, 0.08], [8, 7, 0.16], [8, 8, 0.32], [8, 9, 0.64],
      [9, 5, 0.08], [9, 6, 0.16], [9, 7, 0.32], [9, 8, 0.64], [9, 9, 1.28],
    );

    my $weight = 0.0;
    for my $pattern (@patterns) {
      my ($word_len, $run_len, $bonus) = @{$pattern};
      my $re = qr/\b(?:\w{$word_len,}\s+){$run_len}/;

      # One matching line is enough; stop scanning as soon as we see it.
      for my $line (@{$body}) {
        if ($line =~ $re) {
          $weight += $bonus;
          last;
        }
      }
    }

    $self->{runon_value} = $weight;
  }

  my $value = $self->{runon_value};
  return ($value > $min) if !defined $max;
  return ($value > $min && $value <= $max);
}

Reply via email to