Hello.

I think $msg->generate_msgid in spamassassin-4.0.1 is too slow.

It is called for every message during learning, so the cost adds up quickly.

I rewrote $msg->get_pristine_body_digest in several different ways and benchmarked them against the current implementation (script and results below).

My preference is replace_gr and io_string, but I understand that there is a tradeoff with memory that needs to be considered.

Again, the current $msg->generate_msgid is too costly and I would be happy to see it changed to something compatible and lightweight.

Thanks

#!/usr/local/bin/perl

#use Modern::Perl;
use 5.010; use strict; use warnings;
use Benchmark qw(:all);
use Digest::SHA qw(sha1_hex);
use Email::Simple;
use IO::All;
use IO::String;
use Number::Format qw(format_bytes);

# Sample messages to benchmark against, one per order of magnitude in size.
my @eml_files = (
    "Maildir/.ham/cur/__MASK_1K_eml__",     # 1K
    "Maildir/.ham/cur/__MASK_1M_eml__",     # 1M
    "Maildir/.ham/cur/__MASK_10M_eml__",    # 10M
);

# Raw bytes of each message, read in binary mode.
my @raws = map { io($_)->binary->all } @eml_files;

# Body of each message with one "\n" appended; this is the string the
# candidates digest (the original comment labels it $pristine_body).
my @bodies = map { Email::Simple->new($_)->body . "\n" } @raws;

# Expected digests, in the same order as @eml_files.  They come from these
# log lines:
#   Fri May 17 17:34:29 2024 [8941] info: message: pristine body digest: e463af136d1a9c750d781b54d53232f751e09f72
#   Fri May 17 17:34:30 2024 [8942] info: message: pristine body digest: cc4240e82bc2a86088034e22523dff3b6701ac41
#   Fri May 17 17:34:31 2024 [8941] info: message: pristine body digest: 5557f69add0d91632d2af7b2d4510161e21f688d
my @correct_digests = (
    "e463af136d1a9c750d781b54d53232f751e09f72",
    "cc4240e82bc2a86088034e22523dff3b6701ac41",
    "5557f69add0d91632d2af7b2d4510161e21f688d",
);

# Candidate implementations of the pristine-body digest, keyed by the name
# shown in the benchmark report.  Each entry takes a reference to the body
# string and returns the hex SHA-1 of that body with every CRLF pair replaced
# by a bare LF.  None of them modifies the referenced string.
my %func = (
    # Baseline, labelled by the author as
    # Mail::SpamAssassin::Message::get_pristine_body_digest.  Logic is left
    # untouched on purpose: it is what the other entries are measured against.
    # The pattern can match the empty string, so the loop walks the body in
    # very small steps.
    sa_orig => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        while ($$ref =~ /(.*?)(\015\012)?/gs) {
            $sha->add($1.(defined $2 ? "\012" : ""));
        }
        $sha->hexdigest;
    },

    # Normalise the whole body in one substitution and hash the copy.
    # /r returns a modified copy, so this holds a second full-size string.
    replace_gr => sub {
        my $ref = $_[0];
        sha1_hex( $$ref =~ s/\015\012/\012/gr );
    },

    # Same loop shape as sa_orig, but each match consumes a whole line:
    # $1 is the line content, $2 its terminator (CRLF, LF, or empty at the
    # end of the string).
    while_refine => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        while ($$ref =~ /(.*?)(\015\012|\012|\z)/g) {
            $sha->add($1.(length $2 ? "\012" : ""));
        }
        $sha->hexdigest;
    },

    # Read the body line by line through an IO::String handle.  A lexical
    # line variable is used so the caller's $_ is not clobbered.
    io_string => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        my $io  = IO::String->new($$ref);
        while (my $line = <$io>) {
            $line =~ s/\015\012\z/\012/;
            $sha->add($line);
        }
        $sha->hexdigest;
    },

    # Find each LF with index() and feed the digest one line at a time.
    add_line => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        my $pos = 0;
        while (1) {
            my $idx = index($$ref, "\012", $pos);
            if ($idx < 0) {
                # No LF is left, so the tail cannot end in CRLF; add it as is.
                my $tail = substr($$ref, $pos);
                $sha->add($tail);
                last;
            }
            # +1 takes in the LF that index() found.  This must not depend
            # on $/, which has nothing to do with the delimiter searched for.
            my $len = $idx - $pos + 1;
            $sha->add( substr($$ref, $pos, $len) =~ s/\015\012\z/\012/r );
            $pos += $len;
        }
        $sha->hexdigest;
    },

    # Like add_line, but collect normalised lines in a buffer and hand them
    # to the digest only once the buffer exceeds 64 KiB, to cut the number of
    # add() calls.
    add_chunk => sub {
        my $ref = $_[0];
        my $sha = Digest::SHA->new('sha1');
        my $buf = "";
        my $pos = 0;
        while (1) {
            my $idx = index($$ref, "\012", $pos);
            if ($idx < 0) {
                # No LF is left, so the tail cannot end in CRLF; flush the
                # buffer together with the unmodified tail.
                $buf .= substr($$ref, $pos);
                $sha->add($buf);
                last;
            }
            # +1 takes in the LF that index() found (independent of $/).
            my $len = $idx - $pos + 1;
            $buf .= substr($$ref, $pos, $len) =~ s/\015\012\z/\012/r;
            if (length $buf > 64*1024) {
                $sha->add($buf);
                $buf = "";
            }
            $pos += $len;
        }
        $sha->hexdigest;
    },
);

# Unbuffer STDOUT so progress lines show up as soon as they are printed.
local $| = 1;

# Optional self-check: every candidate must reproduce the expected digest of
# every sample body.  It is off by default, as before; set the environment
# variable SA_BENCH_VERIFY to a true value to run it before benchmarking.
if ($ENV{SA_BENCH_VERIFY}) {
    say "check to make sure calculations are correct...";
    for my $func_name (sort keys %func) {
        for my $i (0 .. $#bodies) {
            my $expected = $correct_digests[$i];
            my $got      = $func{$func_name}->(\$bodies[$i]);
            # Report which candidate and which file disagreed, and how.
            die "$func_name: digest mismatch for $eml_files[$i]: "
              . "got $got, expected $expected\n"
                unless $got eq $expected;
        }
    }
    say "ok.";
}

# Benchmark all candidates against each sample body in turn.
for my $i (0 .. $#bodies) {
    say "--> cmpthese... length:@{[ format_bytes length $raws[$i] ]}";

    # A negative count asks Benchmark to run each candidate for at least
    # that many CPU seconds instead of a fixed number of iterations.
    my $count = -1;

    # Wrap each candidate in a closure that passes the current body by
    # reference, as the candidates expect.
    my %bench = map {
        my $code = $func{$_};
        ( $_ => sub { $code->(\$bodies[$i]) } );
    } keys %func;

    cmpthese($count, \%bench);
    say "";
}

__END__

# junk/sa_msgid_bench.pl
--> cmpthese... length:1,017
                 Rate sa_orig io_string while_refine add_line add_chunk replace_gr
sa_orig        2879/s      --      -87%         -92%     -95%      -95%       -99%
io_string     22040/s    666%        --         -41%     -59%      -64%       -94%
while_refine  37560/s   1205%       70%           --     -30%      -39%       -90%
add_line      53748/s   1767%      144%          43%       --      -13%       -85%
add_chunk     62028/s   2055%      181%          65%      15%        --       -83%
replace_gr   361355/s  12452%     1540%         862%     572%      483%         --

--> cmpthese... length:1.03M
            (warning: too few iterations for a reliable count)
                Rate sa_orig while_refine io_string add_line add_chunk replace_gr
sa_orig      0.440/s      --         -95%      -97%     -99%      -99%      -100%
while_refine  9.21/s   1994%           --      -38%     -72%      -81%       -97%
io_string     14.9/s   3299%          62%        --     -55%      -69%       -95%
add_line      32.9/s   7389%         258%      120%       --      -31%       -88%
add_chunk     48.0/s  10812%         421%      221%      46%        --       -83%
replace_gr     279/s  63431%        2935%     1769%     748%      482%         --

--> cmpthese... length:9.75M
            (warning: too few iterations for a reliable count)
            (warning: too few iterations for a reliable count)
            (warning: too few iterations for a reliable count)
                   Rate sa_orig while_refine io_string add_line add_chunk replace_gr
sa_orig      4.52e-02/s      --         -95%      -97%     -99%      -99%      -100%
while_refine    0.948/s   1999%           --      -44%     -73%      -82%       -97%
io_string        1.70/s   3652%          79%        --     -52%      -68%       -94%
add_line         3.56/s   7769%         275%      110%       --      -33%       -87%
add_chunk        5.33/s  11704%         462%      215%      50%        --       -81%
replace_gr       27.9/s  61672%        2844%     1546%     685%      423%         --

Reply via email to