Hello.
I think $msg->generate_msgid in spamassassin-4.0.1 is too slow.
It is called on every learn operation, so the slowdown has a large impact.
I rewrote $msg->get_pristine_body_digest to compare.
My preference is replace_gr and io_string, but I understand that there
is a tradeoff with memory that needs to be considered.
Again, the current $msg->generate_msgid is too costly and I would be
happy to see it changed to something compatible and lightweight.
Thanks
#!/usr/local/bin/perl
#use Modern::Perl;
use 5.010; use strict; use warnings;
use Benchmark qw(:all);
use Digest::SHA qw(sha1_hex);
use Email::Simple;
use IO::All;
use IO::String;
use Number::Format qw(format_bytes);
# Sample messages used as benchmark input, one per size class.
my @eml_files = (
    "Maildir/.ham/cur/__MASK_1K_eml__",     # 1K
    "Maildir/.ham/cur/__MASK_1M_eml__",     # 1M
    "Maildir/.ham/cur/__MASK_10M_eml__",    # 10M
);

# Full content of each message file, read in binary mode.
my @raws = map io($_)->binary->all, @eml_files;

# Body of each message with a single "\n" appended; this plays the role of
# $pristine_body in the benchmark.
my @bodies = map Email::Simple->new($_)->body."\n", @raws;

# Reference log lines (they look like SpamAssassin debug output -- confirm)
# giving the expected pristine body digest for each sample, in file order:
#
# Fri May 17 17:34:29 2024 [8941] info: message: pristine body digest:
#   e463af136d1a9c750d781b54d53232f751e09f72
# Fri May 17 17:34:30 2024 [8942] info: message: pristine body digest:
#   cc4240e82bc2a86088034e22523dff3b6701ac41
# Fri May 17 17:34:31 2024 [8941] info: message: pristine body digest:
#   5557f69add0d91632d2af7b2d4510161e21f688d

# Expected digests, index-aligned with @eml_files / @bodies.
my @correct_digests = (
    "e463af136d1a9c750d781b54d53232f751e09f72",
    "cc4240e82bc2a86088034e22523dff3b6701ac41",
    "5557f69add0d91632d2af7b2d4510161e21f688d",
);
# Candidate implementations of the pristine-body digest.
#
# Every entry is a code ref taking one argument, a reference to the body
# string, and returning a hex SHA-1 digest.  Each one is expected to hash the
# body with every CRLF ("\015\012") pair replaced by a bare LF ("\012"), so
# all of them should agree on the same input.
my %func = (
    # Baseline: the loop from
    # Mail::SpamAssassin::Message::get_pristine_body_digest.
    # Kept as-is on purpose; it is the implementation being measured against.
    sa_orig => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        while ($$ref =~ /(.*?)(\015\012)?/gs) {
            $sha->add($1.(defined $2 ? "\012" : ""));
        }
        $sha->hexdigest;
    },

    # Build a normalised copy of the whole body with s///r and hash it in
    # one call.  Needs memory for a second copy of the body.
    replace_gr => sub {
        my $ref = $_[0];
        sha1_hex( $$ref =~ s/\015\012/\012/gr );
    },

    # Same loop shape as sa_orig, but the pattern consumes one whole line
    # (terminated by CRLF, LF, or end of string) per iteration.
    while_refine => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        while ($$ref =~ /(.*?)(\015\012|\012|\z)/g) {
            # $2 is empty only when the line was ended by \z.
            $sha->add($1.(length($2) ? "\012" : ""));
        }
        $sha->hexdigest;
    },

    # Read the body line by line through an IO::String handle and fix up
    # each line's terminator before feeding it to the digest.
    io_string => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        my $io = IO::String->new($$ref);
        while (<$io>) {
            $_ =~ s/\015\012\z/\012/;
            $sha->add($_);
        }
        $sha->hexdigest;
    },

    # Walk the body with index()/substr(), adding one line at a time.
    add_line => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        my $pos = 0;
        while (1) {
            my $idx = index($$ref, "\012", $pos);
            if ($idx < 0) {
                # No LF remains, so the tail cannot contain a CRLF pair and
                # needs no normalisation.
                $sha->add( substr($$ref, $pos) );
                last;
            }
            else {
                # Length of the line including its LF.  The terminator
                # searched for is the literal "\012", so its length is 1;
                # this must not depend on the current value of $/.
                my $len = $idx - $pos + 1;
                $sha->add( substr($$ref, $pos, $len) =~ s/\015\012\z/\012/r );
                $pos += $len;
            }
        }
        $sha->hexdigest;
    },

    # Like add_line, but normalised lines are collected in a buffer that is
    # handed to the digest once it grows past 64 KiB.
    add_chunk => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        my $buf = "";
        my $pos = 0;
        while (1) {
            my $idx = index($$ref, "\012", $pos);
            if ($idx < 0) {
                # Tail has no LF, hence no CRLF: append verbatim and flush.
                $buf .= substr($$ref, $pos);
                $sha->add($buf);
                last;
            }
            else {
                # Line length including the single-byte LF terminator
                # (independent of $/).
                my $len = $idx - $pos + 1;
                $buf .= substr($$ref, $pos, $len) =~ s/\015\012\z/\012/r;
                if (length($buf) > 64*1024) {
                    $sha->add($buf);
                    $buf = "";
                }
                $pos += $len;
            }
        }
        $sha->hexdigest;
    },
);
# Turn on autoflush for the currently selected output handle so the progress
# lines below show up as soon as they are printed.
local $| = 1;

# Sanity pass over every implementation and every sample body.  It is
# switched off by the constant-false guard; flip it to 1 to run the check.
# Dies with the name of the first implementation whose digest does not match
# the expected value for a sample.
if (0) {
    say "check to make sure calculations are correct...";
    for my $name (keys %func) {
        my $impl = $func{$name};
        for my $sample (0 .. $#bodies) {
            my $expected = $correct_digests[$sample];
            my $got      = $impl->(\$bodies[$sample]);
            $got eq $expected or die $name;
        }
    }
    say "ok.";
}
# Compare all implementations against each sample body, one table per sample.
for my $i (0 .. $#bodies) {
    # Header line: size of the raw message this round is run against.
    say "--> cmpthese... length:@{[ format_bytes length $raws[$i] ]}";

    # Wrap every candidate in a zero-argument closure bound to this body,
    # which is the form cmpthese() expects.
    my %h;
    for my $name (keys %func) {
        my $code = $func{$name};
        $h{$name} = sub { $code->(\$bodies[$i]) };
    }

    # Negative count: per the Benchmark docs, run each closure for at least
    # that many CPU seconds instead of a fixed number of iterations.
    my $count = -1;
    cmpthese($count, \%h);
    say "";
}
__END__
# junk/sa_msgid_bench.pl
--> cmpthese... length:1,017
                 Rate sa_orig io_string while_refine add_line add_chunk replace_gr
sa_orig        2879/s      --      -87%         -92%     -95%      -95%       -99%
io_string     22040/s    666%        --         -41%     -59%      -64%       -94%
while_refine  37560/s   1205%       70%           --     -30%      -39%       -90%
add_line      53748/s   1767%      144%          43%       --      -13%       -85%
add_chunk     62028/s   2055%      181%          65%      15%        --       -83%
replace_gr   361355/s  12452%     1540%         862%     572%      483%         --
--> cmpthese... length:1.03M
(warning: too few iterations for a reliable count)
                Rate sa_orig while_refine io_string add_line add_chunk replace_gr
sa_orig      0.440/s      --         -95%      -97%     -99%      -99%      -100%
while_refine  9.21/s   1994%           --      -38%     -72%      -81%       -97%
io_string     14.9/s   3299%          62%        --     -55%      -69%       -95%
add_line      32.9/s   7389%         258%      120%       --      -31%       -88%
add_chunk     48.0/s  10812%         421%      221%      46%        --       -83%
replace_gr     279/s  63431%        2935%     1769%     748%      482%         --
--> cmpthese... length:9.75M
(warning: too few iterations for a reliable count)
(warning: too few iterations for a reliable count)
(warning: too few iterations for a reliable count)
                   Rate sa_orig while_refine io_string add_line add_chunk replace_gr
sa_orig      4.52e-02/s      --         -95%      -97%     -99%      -99%      -100%
while_refine    0.948/s   1999%           --      -44%     -73%      -82%       -97%
io_string        1.70/s   3652%          79%        --     -52%      -68%       -94%
add_line         3.56/s   7769%         275%      110%       --      -33%       -87%
add_chunk        5.33/s  11704%         462%      215%      50%        --       -81%
replace_gr       27.9/s  61672%        2844%     1546%     685%      423%         --