Nice!

On Sun, Jul 26, 2020, 01:50 <h...@apache.org> wrote:
> Author: hege > Date: Sun Jul 26 05:50:00 2020 > New Revision: 1880308 > > URL: http://svn.apache.org/viewvc?rev=1880308&view=rev > Log: > Tweaks to increase speed, cut runtime in half > > Modified: > spamassassin/trunk/masses/hit-frequencies > > Modified: spamassassin/trunk/masses/hit-frequencies > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1880308&r1=1880307&r2=1880308&view=diff > > ============================================================================== > --- spamassassin/trunk/masses/hit-frequencies (original) > +++ spamassassin/trunk/masses/hit-frequencies Sun Jul 26 05:50:00 2020 > @@ -805,52 +805,48 @@ sub compute_overlaps_for_rule { > my %overlaps_ham1r = (); > my %overlaps_spam1r = (); > > - foreach my $r2 (keys %hmap_spam) { > - next if $r1 eq $r2; > - > - # require that both rules have at least 1 hit > - next unless ($freq_spam{$r1} && $freq_spam{$r2}); > - > - my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1, > - $hmap_spam{$r2}, $hmap_spam{$r1}); > - > - if ($a1ina2 > 0) > - { > - $overlaps_spam1r{$r2} = $a1ina2; > - > - if (exists $overlaps_spam1{$a1ina2}) > - { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; } > - else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; } > - > - if (exists $overlaps_spam2{$a2ina1}) > - { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; } > - else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; } > + if ($freq_spam{$r1}) { > + foreach my $r2 (keys %hmap_spam) { > + next if $r1 eq $r2; > + > + my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r2, $r1, > + $hmap_spam{$r2}, $hmap_spam{$r1}); > + > + if ($a1ina2 > 0) > + { > + $overlaps_spam1r{$r2} = $a1ina2; > + > + if (exists $overlaps_spam1{$a1ina2}) > + { $overlaps_spam1{$a1ina2} .= " ".$r2."[$a2ina1]"; } > + else { $overlaps_spam1{$a1ina2} = $r2."[$a2ina1]"; } > + > + if (exists $overlaps_spam2{$a2ina1}) > + { $overlaps_spam2{$a2ina1} .= " ".$r2."[$a2ina1]"; } > + else { $overlaps_spam2{$a2ina1} = $r2."[$a2ina1]"; } > + 
} > } > - > } > > - foreach my $r2 (keys %hmap_ham) { > - next if $r1 eq $r2; > - > - # require that both rules have at least 1 hit > - next unless ($freq_ham{$r1} && $freq_ham{$r2}); > - > - my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2, > - $hmap_ham{$r2}, $hmap_ham{$r1}); > - > - if ($a1ina2 > 0) > - { > - $overlaps_ham1r{$r2} = $a1ina2; > - > - if (exists $overlaps_ham1{$a1ina2}) > - { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; } > - else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; } > - > - if (exists $overlaps_ham2{$a2ina1}) > - { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; } > - else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; } > + if ($freq_ham{$r1}) { > + foreach my $r2 (keys %hmap_ham) { > + next if $r1 eq $r2; > + > + my ($a1ina2, $a2ina1) = _hmap_to_overlap_ratio ($r1, $r2, > + $hmap_ham{$r2}, $hmap_ham{$r1}); > + > + if ($a1ina2 > 0) > + { > + $overlaps_ham1r{$r2} = $a1ina2; > + > + if (exists $overlaps_ham1{$a1ina2}) > + { $overlaps_ham1{$a1ina2} .= " ".$r2."[$a2ina1]"; } > + else { $overlaps_ham1{$a1ina2} = $r2."[$a2ina1]"; } > + > + if (exists $overlaps_ham2{$a2ina1}) > + { $overlaps_ham2{$a2ina1} .= " ".$r2."[$a1ina2]"; } > + else { $overlaps_ham2{$a2ina1} = $r2."[$a1ina2]"; } > + } > } > - > } > > _print_overlap_ratios($r1, \%overlaps_spam1, \%overlaps_spam2, "spam", > \%overlaps_ham1r, "ham"); > @@ -934,25 +930,23 @@ sub _prettify_overlap_rules { > sub _hmap_to_overlap_ratio { > my ($r1, $r2, $hmap1, $hmap2) = @_; > > - $hmap1 ||= ''; > - $hmap2 ||= ''; > - if ($hmap1 !~ /[^\000]/ || $hmap2 !~ /[^\000]/) { > - # no hits on either! this would normally give a 100% hitrate match, > - # but that's misleading -- so hide it by giving it a 0% overlap. > - # > - # also, ignore cases where there are no hits on *one* of the rules, > - # while there are hits on the other -- after all, if one rule doesn't > - # have a single hit, it cannot overlap. 
> - # > - return (0,0); > - } > - > # my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print > vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { > print vec($hmap2,$i,1); } print "\n"; > > # count bits in each, so we can show when one is fully subsumed by > another > # with perl's support for bitstring ops, we get C speed here, nice! > + > + # no hits on either? this would normally give a 100% hitrate match, > + # but that's misleading -- so hide it by giving it a 0% overlap. > + # > + # also, ignore cases where there are no hits on *one* of the rules, > + # while there are hits on the other -- after all, if one rule doesn't > + # have a single hit, it cannot overlap. > + > my $a1 = unpack("%32b*", $hmap1); > + return (0,0) unless $a1; > my $a2 = unpack("%32b*", $hmap2); > + return (0,0) unless $a2; > + > my $a1_and_a2 = unpack("%32b*", ($hmap1 & $hmap2)); > > # round rather than truncate > > >