> -----Original Message-----
> From: Theo Van Dinter [mailto:[EMAIL PROTECTED]
> Sent: Wednesday, January 14, 2004 12:27 PM
> To: Spamassassin Devel List
> Subject: Re: 2.62 this week?
>
>
> On Wed, Jan 14, 2004 at 01:10:18PM -0500, Duncan Findlay wrote:
> > Is there a fix for all the bayes poison stuff in 2.62? That's my
> > biggest issue right now with 2.61.
>
> Not really, there's no fix for it in 2.70 either. Invisible
> text and html/text differences are (relatively) easy to pick
> out, but most of the bayes poisoning stuff I've seen is just
> random visible valid words in mail, typically at the end.
>
> I can't think of any rule that would pick up on that as a
> trick unless we put in full language parsing.
>
I hit those big blobs at the end with an eval I wrote.
BODY_RUNON_60_PLUS triggers on all those recent messages that have been
sneaking through with weird subjects, a mis-spelled "vagira", and blocks
of bayes poison at the bottom.
# Detect body run-ons (long runs of consecutive long words) used to poison Bayes.
#
# All five rules call the same eval, runon_value(min, max), which computes one
# weight per message and is true when  min < weight <= max.  The bands below
# do not overlap, so a message hits at most one BODY_RUNON_* rule; a weight of
# 0.1 or less hits none of them.  The last rule passes no max, so it matches
# any weight above 0.60.
#
# NOTE(review): SpamAssassin's config parser treats an unescaped '#' as the
# start of a comment.  If that also applies inside describe text, the trailing
# "#1".."#5" below would be dropped and all five descriptions would read the
# same -- confirm against the parser in use, and escape or reword if so.
body BODY_RUNON_01_15 eval:runon_value('0.1','0.15')
describe BODY_RUNON_01_15 Consecutive Long Stringed Words found #1
body BODY_RUNON_15_20 eval:runon_value('0.15','0.20')
describe BODY_RUNON_15_20 Consecutive Long Stringed Words found #2
body BODY_RUNON_20_30 eval:runon_value('0.20','0.30')
describe BODY_RUNON_20_30 Consecutive Long Stringed Words found #3
body BODY_RUNON_30_60 eval:runon_value('0.30','0.60')
describe BODY_RUNON_30_60 Consecutive Long Stringed Words found #4
body BODY_RUNON_60_PLUS eval:runon_value('0.60')
describe BODY_RUNON_60_PLUS Consecutive Long Stringed Words found #5
# Scores rise by 1.0 per band, from 0.5 (lowest band) to 4.5 (highest band).
# Each header below records that rule's hit counts against the author's test
# corpus on the given date (presumably "Ns/Mh" = spam hits / ham hits, and
# "6083s/4888h" = the corpus's spam/ham message totals -- TODO confirm).
############################################################
# BODY_RUNON_01_15 -- 41s/4h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_01_15 0.5
############################################################
# BODY_RUNON_15_20 -- 18s/1h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_15_20 1.5
############################################################
# BODY_RUNON_20_30 -- 12s/1h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_20_30 2.5
############################################################
# BODY_RUNON_30_60 -- 38s/2h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_30_60 3.5
############################################################
# BODY_RUNON_60_PLUS -- 94s/0h of 10971 corpus (6083s/4888h), 2004-01-08
############################################################
score BODY_RUNON_60_PLUS 4.5
Here is my eval:
# runon_value($self, $body, $min, $max)
#
# Eval test that scores "run-on" text: runs of consecutive long words, each
# followed by whitespace, found within a single element of the body array.
#
#   $self - per-message object; the computed weight is cached in
#           $self->{runon_value} so the body is scanned only once no matter
#           how many rules call this eval.
#   $body - array ref of body text chunks (a missing or non-array value is
#           treated as an empty body).
#   $min  - exclusive lower bound on the weight (defaults to 0).
#   $max  - optional inclusive upper bound; when omitted only $min is tested.
#
# Returns true when  $min < weight  (and  weight <= $max  if $max is given).
# When body_charset_is_likely_to_fp() is true the weight is recorded as 0,
# so the test does not fire for any non-negative $min.
sub runon_value {
  my ($self, $body, $min, $max) = @_;
  $min = 0 if !defined $min;

  if (!exists $self->{runon_value}) {
    if ($self->body_charset_is_likely_to_fp()) {
      # Cache under our own key.  The previous code wrote
      # $self->{uppercase} here, clobbering an unrelated cache slot and
      # leaving this result uncached, so the charset check was repeated
      # for every BODY_RUNON_* rule.
      $self->{runon_value} = 0;
      return 0;
    }

    # [ minimum word length, consecutive words required, weight added ].
    # Longer words and longer runs contribute more.  Entries are applied
    # in this order so the floating-point sum stays stable.
    my @runon_table = (
      [5, 7, 0.02], [5, 8, 0.04], [5, 9, 0.08],
      [6, 7, 0.04], [6, 8, 0.08], [6, 9, 0.16],
      [7, 5, 0.02], [7, 6, 0.04], [7, 7, 0.08], [7, 8, 0.16], [7, 9, 0.32],
      [8, 5, 0.04], [8, 6, 0.08], [8, 7, 0.16], [8, 8, 0.32], [8, 9, 0.64],
      [9, 5, 0.08], [9, 6, 0.16], [9, 7, 0.32], [9, 8, 0.64], [9, 9, 1.28],
    );

    my $lines = (ref $body eq 'ARRAY') ? $body : [];
    my $weight = 0.0;
    for my $entry (@runon_table) {
      my ($word_len, $run_len, $bonus) = @{$entry};
      my $run_re = qr/\b(?:\w{$word_len,}\s+){$run_len}/;
      # Each table entry counts at most once, however many chunks match.
      $weight += $bonus if grep { /$run_re/ } @{$lines};
    }
    $self->{runon_value} = $weight;
  }

  my $score = $self->{runon_value};
  return ($score > $min) if !defined $max;
  return ($score > $min && $score <= $max);
}