Hi, cPanel has developed a native Perl Pyzor implementation for SpamAssassin; a diff against SpamAssassin 4.0 follows. At the moment I am using it in production on a small server; more tests and opinions are welcome.
Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor. Cheers Giovanni diff --git a/MANIFEST b/MANIFEST index 25d0192..2d9588c 100644 --- a/MANIFEST +++ b/MANIFEST @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm lib/Mail/SpamAssassin/PluginHandler.pm lib/Mail/SpamAssassin/Plugin/URILocalBL.pm +lib/Mail/SpamAssassin/Pyzor/Client.pm +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm +lib/Mail/SpamAssassin/Pyzor/Digest.pm +lib/Mail/SpamAssassin/Pyzor.pm lib/Mail/SpamAssassin/RegistryBoundaries.pm lib/Mail/SpamAssassin/Reporter.pm lib/Mail/SpamAssassin/SQLBasedAddrList.pm diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm index 3efd4b4..e4c9c05 100644 --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor; use Mail::SpamAssassin::Plugin; use Mail::SpamAssassin::Logger; -use Mail::SpamAssassin::Timeout; -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path - proc_status_ok exit_status_str); +use Mail::SpamAssassin::Util qw(untaint_var); + use strict; use warnings; # use bytes; use re 'taint'; -use Storable; -use POSIX qw(PIPE_BUF WNOHANG _exit); - our @ISA = qw(Mail::SpamAssassin::Plugin); sub new { @@ -78,7 +74,7 @@ sub set_config { my ($self, $conf) = @_; my @cmds; -=head1 USER OPTIONS +=head1 ADMINISTRATOR OPTIONS =over 4 @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available. type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL }); -=item pyzor_fork (0|1) (default: 0) - -Instead of running Pyzor synchronously, fork separate process for it and -read the results in later (similar to async DNS lookups). Increases -throughput. Experimental. 
- -=cut - - push(@cmds, { - setting => 'pyzor_fork', - is_admin => 1, - default => 0, - type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC, - }); - -=item pyzor_count_min NUMBER (default: 5) +=item pyzor_count_min NUMBER (default: 5) This option sets how often a message's body checksum must have been reported to the Pyzor server before SpamAssassin will consider the Pyzor @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>. type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC }); - # Deprecated setting, the name makes no sense! - push (@cmds, { - setting => 'pyzor_max', - is_admin => 1, - type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC, - code => sub { - my ($self, $key, $value, $line) = @_; - warn("deprecated setting used, change pyzor_max to pyzor_count_min\n"); - if ($value !~ /^\d+$/) { - return $Mail::SpamAssassin::Conf::INVALID_VALUE; - } - $self->{pyzor_count_min} = $value; - } - }); - -=item pyzor_whitelist_min NUMBER (default: 10) - -This option sets how often a message's body checksum must have been -whitelisted to the Pyzor server for SpamAssassin to consider ignoring the -result. Final decision is made by pyzor_whitelist_factor. - -=cut - - push (@cmds, { - setting => 'pyzor_whitelist_min', - is_admin => 1, - default => 10, - type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC - }); - -=item pyzor_whitelist_factor NUMBER (default: 0.2) - -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min. -For default setting this means: 50 reports requires 10 whitelistings. - -=cut - - push (@cmds, { - setting => 'pyzor_whitelist_factor', - is_admin => 1, - default => 0.2, - type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC - }); - =back -=head1 ADMINISTRATOR OPTIONS - =over 4 =item pyzor_timeout n (default: 5) @@ -210,478 +145,182 @@ removing one of them. 
type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION }); -=item pyzor_options options +=item pyzor_whitelist_min NUMBER (default: 10) -Specify additional options to the pyzor(1) command. Please note that only -characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons. +This option sets how often a message's body checksum must have been +whitelisted to the Pyzor server for SpamAssassin to consider ignoring the +result. Final decision is made by pyzor_whitelist_factor. =cut push (@cmds, { - setting => 'pyzor_options', + setting => 'pyzor_whitelist_min', is_admin => 1, - default => '', - type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING, - code => sub { - my ($self, $key, $value, $line) = @_; - if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) { - return $Mail::SpamAssassin::Conf::INVALID_VALUE; - } - $self->{pyzor_options} = $1; - } + default => 10, + type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC }); -=item pyzor_path STRING +=item pyzor_whitelist_factor NUMBER (default: 0.2) -This option tells SpamAssassin specifically where to find the C<pyzor> -client instead of relying on SpamAssassin to find it in the current -PATH. Note that if I<taint mode> is enabled in the Perl interpreter, -you should use this, as the current PATH will have been cleared. +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min. +For default setting this means: 50 reports requires 10 whitelistings. 
=cut push (@cmds, { - setting => 'pyzor_path', + setting => 'pyzor_whitelist_factor', is_admin => 1, - default => undef, - type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING, - code => sub { - my ($self, $key, $value, $line) = @_; - if (!defined $value || !length $value) { - return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; - } - $value = untaint_file_path($value); - if (!-x $value) { - info("config: pyzor_path \"$value\" isn't an executable"); - return $Mail::SpamAssassin::Conf::INVALID_VALUE; - } - - $self->{pyzor_path} = $value; - } + default => 0.2, + type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC }); $conf->{parser}->register_commands(\@cmds); } sub is_pyzor_available { - my ($self) = @_; + my ($self) = @_; - my $pyzor = $self->{main}->{conf}->{pyzor_path} || - Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor'); - - unless ($pyzor && -x $pyzor) { - dbg("pyzor: no pyzor executable found"); - $self->{pyzor_available} = 0; - return 0; - } - - # remember any found pyzor - $self->{main}->{conf}->{pyzor_path} = $pyzor; - - dbg("pyzor: pyzor is available: $pyzor"); - return 1; + local $@; + eval { + require Mail::SpamAssassin::Pyzor::Digest; + require Mail::SpamAssassin::Pyzor::Client; + }; + return $@ ? 
0 : 1; } -sub finish_parsing_start { - my ($self, $opts) = @_; +sub get_pyzor_interface { + my ($self) = @_; - # If forking, hard adjust priority -100 to launch early - # Find rulenames from eval_to_rule mappings - if ($opts->{conf}->{pyzor_fork}) { - foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) { - dbg("pyzor: adjusting rule $_ priority to -100"); - $opts->{conf}->{priority}->{$_} = -100; - } + if (!$self->{main}->{conf}->{use_pyzor}) { + dbg("pyzor: use_pyzor option not enabled, disabling Pyzor"); + $self->{pyzor_interface} = "disabled"; + $self->{pyzor_available} = 0; + } + elsif ($self->is_pyzor_available()) { + $self->{pyzor_interface} = "pyzor"; + $self->{pyzor_available} = 1; + } + else { + dbg("pyzor: no pyzor found, disabling Pyzor"); + $self->{pyzor_available} = 0; } } sub check_pyzor { - my ($self, $pms, $full) = @_; - - return 0 if !$self->{pyzor_available}; - return 0 if !$self->{main}->{conf}->{use_pyzor}; - - return 0 if $pms->{pyzor_running}; - $pms->{pyzor_running} = 1; - - return 0 if !$self->is_pyzor_available(); - - my $timer = $self->{main}->time_method("check_pyzor"); + my ($self, $permsgstatus, $full) = @_; # initialize valid tags - $pms->{tag_data}->{PYZOR} = ''; - - # create fulltext tmpfile now (before possible forking) - $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile(); - - ## non-forking method - - if (!$self->{main}->{conf}->{pyzor_fork}) { - my @results = $self->pyzor_lookup($pms); - return $self->_check_result($pms, \@results); - } - - ## forking method - - $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name(); - $pms->rule_pending($pms->{pyzor_rulename}); # mark async - - # create socketpair for communication - $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new(); - my $back_selector = ''; - $pms->{pyzor_backchannel}->set_selector(\$back_selector); - eval { - $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork(); - } or do { - dbg("pyzor: backchannel pre-setup failed: $@"); - 
delete $pms->{pyzor_backchannel}; - return 0; - }; + $permsgstatus->{tag_data}->{PYZOR} = ""; - my $pid = fork(); - if (!defined $pid) { - info("pyzor: child fork failed: $!"); - delete $pms->{pyzor_backchannel}; - return 0; - } - if (!$pid) { - $0 = "$0 (pyzor)"; - $SIG{CHLD} = 'DEFAULT'; - $SIG{PIPE} = 'IGNORE'; - $SIG{$_} = sub { - eval { dbg("pyzor: child process $$ caught signal $_[0]"); }; - _exit(6); # avoid END and destructor processing - kill('KILL',$$); # still kicking? die! - } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2); - dbg("pyzor: child process $$ forked"); - $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork(); - my @results = $self->pyzor_lookup($pms); - my $backmsg; - eval { - $backmsg = Storable::freeze(\@results); - }; - if ($@) { - dbg("pyzor: child return value freeze failed: $@"); - _exit(0); # avoid END and destructor processing - } - if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) { - dbg("pyzor: child backchannel write failed: $!"); - } - _exit(0); # avoid END and destructor processing - } - - $pms->{pyzor_pid} = $pid; + my $timer = $self->{main}->time_method("check_pyzor"); - eval { - $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid); - } or do { - dbg("pyzor: backchannel post-setup failed: $@"); - delete $pms->{pyzor_backchannel}; - return 0; - }; + $self->get_pyzor_interface(); + return 0 unless $self->{pyzor_available}; - return 0; + return $self->pyzor_lookup($permsgstatus, $full); } sub pyzor_lookup { - my ($self, $pms) = @_; - - my $conf = $self->{main}->{conf}; - my $timeout = $conf->{pyzor_timeout}; - - # note: not really tainted, this came from system configuration file - my $path = untaint_file_path($conf->{pyzor_path}); - my $opts = untaint_var($conf->{pyzor_options}) || ''; - - $pms->enter_helper_run_mode(); - - my $pid; - my @resp; - my $timer = Mail::SpamAssassin::Timeout->new( - { secs => $timeout, deadline => $pms->{master_deadline} }); - my $err = $timer->run_and_catch(sub { - 
local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" }; - - dbg("pyzor: opening pipe: ". - join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile})); - - $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR, - $pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check"); - $pid or die "$!\n"; - - # read+split avoids a Perl I/O bug (Bug 5985) - my($inbuf, $nread); - my $resp = ''; - while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf } - defined $nread or die "error reading from pipe: $!"; - @resp = split(/^/m, $resp, -1); - - my $errno = 0; - close PYZOR or $errno = $!; - if (proc_status_ok($?, $errno)) { - dbg("pyzor: [%s] finished successfully", $pid); - } elsif (proc_status_ok($?, $errno, 0, 1)) { # sometimes it exits with 1 - dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno)); - } else { - info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno)); - } - - }); - - if (defined(fileno(*PYZOR))) { # still open - if ($pid) { - if (kill('TERM', $pid)) { - dbg("pyzor: killed stale helper [$pid]"); - } else { - dbg("pyzor: killing helper application [$pid] failed: $!"); - } - } - my $errno = 0; - close PYZOR or $errno = $!; - proc_status_ok($?, $errno) - or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno)); - } - - $pms->leave_helper_run_mode(); - - if ($timer->timed_out()) { - dbg("pyzor: check timed out after $timeout seconds"); - return (); - } elsif ($err) { - chomp $err; - info("pyzor: check failed: $err"); - return (); - } - - return @resp; -} - -sub check_tick { - my ($self, $opts) = @_; - $self->_check_forked_result($opts->{permsgstatus}, 0); -} - -sub check_cleanup { - my ($self, $opts) = @_; - $self->_check_forked_result($opts->{permsgstatus}, 1); -} - -sub _check_forked_result { - my ($self, $pms, $finish) = @_; - - return 0 if !$pms->{pyzor_backchannel}; - return 0 if !$pms->{pyzor_pid}; + my ( $self, $permsgstatus, $fulltext ) = @_; + my $conf = $self->{main}->{conf}; + my $timeout = 
$conf->{pyzor_timeout}; + + my $client = ( $self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) ); + my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext ); + + local $@; + my $ref = eval { $client->check($digest); }; + dbg("pyzor: got response: $client->{'_server_host'}"); + # $client reply must be an hash + return 0 if (not (ref $ref eq ref {})); + if ($@) { + my $err = $@; - my $timer = $self->{main}->time_method("check_pyzor"); + $err = eval { $err->get_message() } || $err; - $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited}; - - my $kid_pid = $pms->{pyzor_pid}; - # if $finish, force waiting for the child - my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG); - if ($pid == 0) { - #dbg("pyzor: child process $kid_pid not finished yet, trying later"); - if ($pms->{pyzor_abort}) { - dbg("pyzor: bailing out due to deadline/shortcircuit"); - kill('TERM', $kid_pid); - if (waitpid($kid_pid, WNOHANG) == 0) { - sleep(1); - if (waitpid($kid_pid, WNOHANG) == 0) { - dbg("pyzor: child process $kid_pid still alive, KILL"); - kill('KILL', $kid_pid); - waitpid($kid_pid, 0); + warn("pyzor: check failed: $err\n"); + return 0; + } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) { + if(defined $ref->{'Code'} and defined $ref->{'Diag'}) { + dbg("pyzor: check failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}"); + } else { + dbg("pyzor: check failed with undefined code"); } - } - delete $pms->{pyzor_pid}; - delete $pms->{pyzor_backchannel}; + return 0; } - return 0; - } elsif ($pid == -1) { - # child does not exist? 
- dbg("pyzor: child process $kid_pid already handled?"); - delete $pms->{pyzor_backchannel}; - return 0; - } - $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas + my $pyzor_count = untaint_var($ref->{'Count'}) + 0; + my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0; + my $count_min = $conf->{pyzor_count_min}; + my $wl_min = $conf->{pyzor_whitelist_min}; - dbg("pyzor: child process $kid_pid finished, reading results"); + my $wl_limit = $pyzor_whitelisted >= $wl_min ? + $pyzor_count * $conf->{pyzor_whitelist_factor} : 0; - my $backmsg; - my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, PIPE_BUF); - if (!defined $ret || $ret == 0) { - dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!)); - delete $pms->{pyzor_backchannel}; - return 0; - } - - delete $pms->{pyzor_backchannel}; + $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted $pyzor_whitelisted times."); - my $results; - eval { - $results = Storable::thaw($backmsg); - }; - if ($@) { - dbg("pyzor: child return value thaw failed: $@"); - return; - } - - $self->_check_result($pms, $results); -} + dbg("pyzor: result: COUNT=$pyzor_count/$count_min WHITELIST=$pyzor_whitelisted/$wl_min/%.1f", + $wl_limit); -sub _check_result { - my ($self, $pms, $results) = @_; - - if (!@$results) { - dbg("pyzor: no response from server"); - return 0; - } - - my $count = 0; - my $count_wl = 0; - foreach my $res (@$results) { - chomp($res); - if ($res =~ /^Traceback/) { - info("pyzor: internal error, python traceback seen in response: $res"); + # Empty body etc results in same hash, we should skip very large numbers.. 
+ if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) { + dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000"); return 0; } - dbg("pyzor: got response: $res"); - # this regexp is intended to be a little bit forgiving - if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) { - # until pyzor servers can sync their DBs, - # sum counts obtained from all servers - $count += untaint_var($1)+0; # crazy but needs untainting - $count_wl += untaint_var($2)+0; - } else { - # warn on failures to parse - info("pyzor: failure to parse response \"$res\""); - } - } - - my $conf = $self->{main}->{conf}; - - my $count_min = $conf->{pyzor_count_min}; - my $wl_min = $conf->{pyzor_whitelist_min}; - my $wl_limit = $count_wl >= $wl_min ? - $count * $conf->{pyzor_whitelist_factor} : 0; - - dbg("pyzor: result: COUNT=$count/$count_min WHITELIST=$count_wl/$wl_min/%.1f", - $wl_limit); - $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl times."); - - # Empty body etc results in same hash, we should skip very large numbers.. - if ($count >= 1000000 || $count_wl >= 10000) { - dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 1000000/10000"); - return 0; - } - - # Whitelisted? - if ($wl_limit && $count_wl >= $wl_limit) { - dbg("pyzor: message whitelisted"); - return 0; - } + # Whitelisted? 
+ if ($wl_limit && $pyzor_whitelisted >= $wl_limit) { + dbg("pyzor: message whitelisted"); + return 0; + } - if ($count >= $count_min) { - if ($conf->{pyzor_fork}) { - # forked needs to run got_hit() - $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval'); + if ( $pyzor_count >= $count_min ) { + return 1; } - return 1; - } - return 0; + return 0; } sub plugin_report { my ($self, $options) = @_; - return if !$self->{pyzor_available}; - return if !$self->{main}->{conf}->{use_pyzor}; - return if $options->{report}->{options}->{dont_report_to_pyzor}; - return if !$self->is_pyzor_available(); - - # use temporary file: open2() is unreliable due to buffering under spamd - my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text}); - if ($self->pyzor_report($options, $tmpf)) { - $options->{report}->{report_available} = 1; - info("reporter: spam reported to Pyzor"); - $options->{report}->{report_return} = 1; - } - else { - info("reporter: could not report spam to Pyzor"); - } - $options->{report}->delete_fulltext_tmpfile($tmpf); + return unless $self->{pyzor_available}; + return unless $self->{main}->{conf}->{use_pyzor}; - return 1; + if (!$options->{report}->{options}->{dont_report_to_pyzor} && $self->is_pyzor_available()) + { + if ($self->pyzor_report($options)) { + $options->{report}->{report_available} = 1; + info("reporter: spam reported to Pyzor"); + $options->{report}->{report_return} = 1; + } + else { + info("reporter: could not report spam to Pyzor"); + } + } } sub pyzor_report { - my ($self, $options, $tmpf) = @_; - - # note: not really tainted, this came from system configuration file - my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path}); - my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || ''; + my ( $self, $options ) = @_; - my $timeout = $self->{main}->{conf}->{pyzor_timeout}; + my $timeout = $self->{main}->{conf}->{pyzor_timeout}; - $options->{report}->enter_helper_run_mode(); + my $client = ( 
$self->{'_pyzor_client'} ||= Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) ); - my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout }); - my $err = $timer->run_and_catch(sub { + my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} ); - local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" }; - - dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< $tmpf")); - - my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR, - $tmpf, 1, $path, split(' ', $opts), "report"); - $pid or die "$!\n"; - - my($inbuf,$nread,$nread_all); $nread_all = 0; - # response is ignored, just check its existence - while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread } - defined $nread or die "error reading from pipe: $!"; - - dbg("pyzor: empty response") if $nread_all < 1; - - my $errno = 0; close PYZOR or $errno = $!; - # closing a pipe also waits for the process executing on the pipe to - # complete, no need to explicitly call waitpid - # my $child_stat = waitpid($pid,0) > 0 ? $? 
: undef; - if (proc_status_ok($?,$errno, 0)) { - dbg("pyzor: [%s] reporter finished successfully", $pid); - } else { - info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno)); + local $@; + my $ref = eval { $client->report($digest); }; + if ($@) { + warn("pyzor: report failed: $@"); + return 0; } - - }); - - $options->{report}->leave_helper_run_mode(); - - if ($timer->timed_out()) { - dbg("reporter: pyzor report timed out after $timeout seconds"); - return 0; - } - - if ($err) { - chomp $err; - if ($err eq '__brokenpipe__ignore__') { - dbg("reporter: pyzor report failed: broken pipe"); - } else { - warn("reporter: pyzor report failed: $err\n"); + elsif ( $ref->{'Code'} ne 200 ) { + dbg("pyzor: report failed with invalid code: $ref->{'Code'}: $ref->{'Diag'}"); + return 0; } - return 0; - } - return 1; + return 1; } -# Version features -sub has_fork { 1 } - 1; - -=back - -=cut diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm new file mode 100644 index 0000000..8ac27f4 --- /dev/null +++ b/lib/Mail/SpamAssassin/Pyzor.pm @@ -0,0 +1,56 @@ +package Mail::SpamAssassin::Pyzor; + +# Copyright 2018 cPanel, LLC. +# All rights reserved. +# http://cpanel.net +# +# <@LICENSE> +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# </@LICENSE> +# + +use strict; +use warnings; + +our $VERSION = '0.06_01'; + +=encoding utf-8 + +=head1 NAME + +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl + +=head1 DESCRIPTION + +This distribution contains Perl implementations of parts of +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering. +It is intended for use with L<Mail::SpamAssassin> but may be useful +in other contexts. + +See the following modules for information on specific tools that +the distribution includes: + +=over + +=item * L<Mail::SpamAssassin::Pyzor::Client> + +=item * L<Mail::SpamAssassin::Pyzor::Digest> + +=back + +=cut + +1; diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm new file mode 100644 index 0000000..ccff868 --- /dev/null +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm @@ -0,0 +1,415 @@ +package Mail::SpamAssassin::Pyzor::Client; + +# Copyright 2018 cPanel, LLC. +# All rights reserved. +# http://cpanel.net +# +# <@LICENSE> +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# </@LICENSE> +# + +use strict; +use warnings; + +=encoding utf-8 + +=head1 NAME + +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic + +=head1 SYNOPSIS + + use Mail::SpamAssassin::Pyzor::Client (); + use Mail::SpamAssassin::Pyzor::Digest (); + + my $client = Mail::SpamAssassin::Pyzor::Client->new(); + + my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg ); + + my $check_ref = $client->check($digest); + die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200'; + + my $report_ref = $client->report($digest); + die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200'; + +=head1 DESCRIPTION + +A bare-bones L<Pyzor|http://pyzor.org> client that currently only +implements the functionality needed for L<Mail::SpamAssassin>. + +=head1 PROTOCOL DETAILS + +The Pyzor protocol is not a published standard, and there appears to be +no meaningful public documentation. What follows is enough information, +largely gleaned through forum posts and reverse engineering, to facilitate +effective use of this module: + +Pyzor is an RPC-oriented, message-based protocol. Each message +is a simple dictionary of 7-bit ASCII keys and values. Server responses +always include at least the following: + +=over + +=item * C<Code> - Similar to HTTP status codes; anything besides C<200> +is an error. + +=item * C<Diag> - Similar to HTTP status reasons: a text description +of the status. + +=back + +(NB: There are additional standard response headers that are useful only for +the protocol itself and thus are not part of this module’s returns.) + +=head2 Reliability + +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its +destination. A transmission failure can happen in either the request or +the response; in either case, a timeout error will result. Such errors +are represented as thrown instances of L<Mail::Pyzor::X::Timeout>. 
+ +=cut + +#---------------------------------------------------------------------- + +our $VERSION = '0.04'; + +our $DEFAULT_SERVER_HOST = 'public.pyzor.org'; +our $DEFAULT_SERVER_PORT = 24441; +our $DEFAULT_USERNAME = 'anonymous'; +our $DEFAULT_PASSWORD = ''; +our $DEFAULT_OP_SPEC = '20,3,60,3'; +our $PYZOR_PROTOCOL_VERSION = 2.1; +our $DEFAULT_TIMEOUT = 3.5; +our $READ_SIZE = 8192; + +use IO::Socket::INET (); +use Digest::SHA qw(sha1 sha1_hex); + +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 'Time', 'Sig' ); + +#---------------------------------------------------------------------- + +=head1 CONSTRUCTOR + +=head2 new(%OPTS) + +Create a new pyzor client. + +=over 2 + +=item Input + +%OPTS are (all optional): + +=over 3 + +=item * C<server_host> - The pyzor server host to connect to (default is +C<public.pyzor.org>) + +=item * C<server_port> - The pyzor server port to connect to (default is +24441) + +=item * C<username> - The username to present to the pyzor server (default +is C<anonymous>) + +=item * C<password> - The password to present to the pyzor server (default +is empty) + +=item * C<timeout> - The maximum time, in seconds, to wait for a response +from the pyzor server (default is 3.5) + +=back + +=item Output + +=over 3 + +Returns a L<Mail::SpamAssassin::Pyzor::Client> object. + +=back + +=back + +=cut + +sub new { + my ( $class, %OPTS ) = @_; + + return bless { + '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST, + '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT, + '_username' => $OPTS{'username'} || $DEFAULT_USERNAME, + '_password' => $OPTS{'password'} || $DEFAULT_PASSWORD, + '_op_spec' => $DEFAULT_OP_SPEC, + '_timeout' => $OPTS{'timeout'} || $DEFAULT_TIMEOUT, + }, $class; +} + +#---------------------------------------------------------------------- + +=head1 REQUEST METHODS + +=head2 report($digest) + +Report the digest of a spam message to the pyzor server. 
This function +will throw if a messaging failure or timeout happens. + +=over 2 + +=item Input + +=over 3 + +=item $digest C<SCALAR> + +The message digest to report, as given by +C<Mail::SpamAssassin::Pyzor::Digest::get()>. + +=back + +=item Output + +=over 3 + +=item C<HASHREF> + +Returns a hashref of the standard attributes noted above. + +=back + +=back + +=cut + +sub report { + my ( $self, $digest ) = @_; + + my $msg_ref = $self->_get_base_msg( 'report', $digest ); + + $msg_ref->{'Op-Spec'} = $self->{'_op_spec'}; + + return $self->_send_receive_msg($msg_ref); +} + +=head2 check($digest) + +Check the digest of a message to see if +the pyzor server has a report for it. This function +will throw if a messaging failure or timeout happens. + +=over 2 + +=item Input + +=over 3 + +=item $digest C<SCALAR> + +The message digest to check, as given by +C<Mail::SpamAssassin::Pyzor::Digest::get()>. + +=back + +=item Output + +=over 3 + +=item C<HASHREF> + +Returns a hashref of the standard attributes noted above +as well as the following: + +=over + +=item * C<Count> - The number of reports the server has received +for the given digest. + +=item * C<WL-Count> - The number of whitelist requests the server has received +for the given digest. + +=back + +=back + +=back + +=cut + +sub check { + my ( $self, $digest ) = @_; + + return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) ); +} + +# ---------------------------------------- + +sub _send_receive_msg { + my ( $self, $msg_ref ) = @_; + + my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?'; + + $self->_sign_msg($msg_ref); + + return $self->_do_send_receive( + $self->_generate_packet_from_message($msg_ref) . 
"\n\n", + $thread_id, + ); +} + +sub _get_base_msg { + my ( $self, $op, $digest ) = @_; + + die "Implementor error: op is required" if !$op; + die "error: digest is required" if !$digest; + + return { + 'User' => $self->{'_username'}, + 'PV' => $PYZOR_PROTOCOL_VERSION, + 'Time' => time(), + 'Op' => $op, + 'Op-Digest' => $digest, + 'Thread' => $self->_generate_thread_id() + }; +} + +sub _do_send_receive { + my ( $self, $packet, $thread_id ) = @_; + + my $sock = $self->_get_connection_or_die(); + + $self->_send_packet( $sock, $packet ); + my $response = $self->_receive_packet( $sock, $thread_id ); + + return 0 if not defined $response; + + my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) }; + + delete $resp_hr->{'Thread'}; + + my $response_pv = delete $resp_hr->{'PV'}; + + if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) { + warn "Unexpected protocol version ($response_pv) in Pyzor response!"; + } + + return $resp_hr; +} + +sub _receive_packet { + my ( $self, $sock, $thread_id ) = @_; + + my $timeout = $self->{'_timeout'} * 1000; + + my $end_time = time + $self->{'_timeout'}; + + $sock->blocking(0); + my $response = ''; + my $rout = ''; + my $rin = ''; + vec( $rin, fileno($sock), 1 ) = 1; + + while (1) { + my $time_left = $end_time - time; + + if ( $time_left <= 0 ) { + warn("Did not receive a response from the pyzor server $self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!"); + return; + } + + my $bytes = sysread( $sock, $response, $READ_SIZE, length $response ); + if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) { + warn "read from socket: $!"; + } + + if ( index( $response, "\n\n" ) > -1 ) { + + # Reject the response unless its thread ID matches what we sent. + # This prevents confusion among concurrent Pyzor reqeusts. 
+ if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) { + last; + } + else { + $response = ''; + } + } + + my $found = select( $rout = $rin, undef, undef, $time_left ); + warn "select(): $!" if $found == -1; + } + + return $response; +} + +sub _send_packet { + my ( $self, $sock, $packet ) = @_; + + $sock->blocking(1); + syswrite( $sock, $packet ) or warn "write to socket: $!"; + + return; +} + +sub _get_connection_or_die { + my ($self) = @_; + + # clear the socket if the PID changes + if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) { + undef $self->{'_sock_pid'}; + undef $self->{'_sock'}; + } + + $self->{'_sock_pid'} ||= $$; + $self->{'_sock'} ||= IO::Socket::INET->new( + 'PeerHost' => $self->{'_server_host'}, + 'PeerPort' => $self->{'_server_port'}, + 'Proto' => 'udp' + ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@ $!"; + + return $self->{'_sock'}; +} + +sub _sign_msg { + my ( $self, $msg_ref ) = @_; + + $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex( + Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) ) + ); + + return 1; +} + +sub _generate_packet_from_message { + my ( $self, $msg_ref ) = @_; + + return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length $msg_ref->{$_} } @hash_order ); +} + +sub _generate_thread_id { + my $RAND_MAX = 2**16; + my $val = 0; + $val = int rand($RAND_MAX) while $val < 1024; + return $val; +} + +sub _get_user_pass_hash_key { + my ($self) = @_; + + return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . $self->{'_password'} ); +} + +1; diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm new file mode 100644 index 0000000..0e8a5ae --- /dev/null +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm @@ -0,0 +1,103 @@ +package Mail::SpamAssassin::Pyzor::Digest; + +# Copyright 2018 cPanel, LLC. +# All rights reserved. 
+# http://cpanel.net +# +# <@LICENSE> +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# </@LICENSE> +# + +use strict; +use warnings; + +=encoding utf-8 + +=head1 NAME + +Mail::SpamAssassin::Pyzor::Digest + +=head1 SYNOPSIS + + my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text ); + +=head1 DESCRIPTION + +A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>. + +=cut + +#---------------------------------------------------------------------- + +use Email::MIME (); + +use Mail::SpamAssassin::Pyzor::Digest::Pieces (); +use Digest::SHA qw(sha1_hex); + +our $VERSION = '0.03'; + +#---------------------------------------------------------------------- + +=head1 FUNCTIONS + +=head2 $hex = get( $MSG ) + +This takes an email message in raw MIME text format (i.e., as saved in the +standard mbox format) and returns the message’s Pyzor digest in lower-case +hexadecimal. + +The output from this function should normally be identical to that of +the C<pyzor> script’s C<digest> command. It is suitable for use in +L<Mail::SpamAssassin::Pyzor::Client>’s request methods. 
+
+=cut
+
+# Return the SHA-1 hex digest of the message's predigest buffer.
+# $text may be the raw MIME text or a reference to it.
+sub get {
+    my ($text) = @_;
+    return Digest::SHA::sha1_hex( ${ _get_predigest($text) } );
+}
+
+# NB: This is called from the test.
+#
+# Build the octet buffer that get() hashes: every payload of the parsed
+# message is split into lines, each line is normalized, lines that are too
+# short are dropped, and the survivors are assembled into one buffer.
+#
+# Returns a reference to that buffer.
+sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
+    my ($msg) = @_;
+
+    # Callers may hand over the message text itself or a reference to it.
+    # A plain string must not be dereferenced: that dies under "strict refs".
+    my $msg_text_sr = ref($msg) eq 'SCALAR' ? $msg : \$msg;
+
+    my $parsed = Email::MIME->new($$msg_text_sr);
+
+    my @lines;
+
+    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
+
+    for my $payload (@$payloads_ar) {
+        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
+        for my $line (@p_lines) {
+            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
+
+            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
+
+            # Make sure we have an octet string.
+            utf8::encode($line) if utf8::is_utf8($line);
+
+            push @lines, $line;
+        }
+    }
+
+    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
+
+    return $digest_sr;
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
new file mode 100644
index 0000000..522accd
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
@@ -0,0 +1,301 @@
+package Mail::SpamAssassin::Pyzor::Digest::Pieces;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# </@LICENSE> +# + +use strict; +use warnings; + +=encoding utf-8 + +=head1 NAME + +Mail::SpamAssassin::Pyzor::Digest::Pieces + +=head1 DESCRIPTION + +This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>. + +It reimplements logic found in pyzor’s F<digest.py> module +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>). + +=cut + +#---------------------------------------------------------------------- + +use Email::MIME::ContentType (); +use Encode (); + +our $VERSION = '0.03'; + +# each tuple is [ offset, length ] +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] ); + +use constant { + _MIN_LINE_LENGTH => 8, + + _ATOMIC_NUM_LINES => 4, +}; + +#---------------------------------------------------------------------- + +=head1 FUNCTIONS + +=head2 $strings_ar = digest_payloads( $EMAIL_MIME ) + +This imitates the corresponding object method in F<digest.py>. +It returns a reference to an array of strings. Each string can be either +a byte string or a character string (e.g., UTF-8 decoded). + +NB: RFC 2822 stipulates that message bodies should use CRLF +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings> +will thus convert any plain CRs in a quoted-printable message +body into CRLF. Python, though, doesn’t do this, so the output of +our implementation of C<digest_payloads()> diverges from that of the Python +original. It doesn’t ultimately make a difference since the line-ending +whitespace gets trimmed regardless, but it’s necessary to factor in when +comparing the output of our implementation with the Python output. 
+
+=cut
+
+sub digest_payloads {
+    my ($parsed) = @_;
+
+    my @subparts = $parsed->subparts();
+
+    my @payloads;
+
+    if (@subparts) {
+
+        # Multipart: collect the payloads of every leaf part, depth-first.
+        @payloads = map { @{ digest_payloads($_) } } @subparts;
+    }
+    else {
+        my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
+
+        my $payload;
+
+        if ( $main_type eq 'text' ) {
+
+            # Decode transfer encoding, but leave us as a byte string.
+            # Note that this is where Email::MIME converts plain LF to CRLF.
+            my $octets = $parsed->body();
+
+            # This does the actual character decoding (i.e., “charset”).
+            # Decode a scratch copy, since Encode may modify its input in
+            # place depending on the CHECK value.
+            $payload = do {
+                local $@;
+                eval { Encode::decode( $encoding, my $scratch = $octets, $encode_check ) };
+            };
+
+            # An unknown charset name or a strict-mode decoding failure
+            # must not abort the whole digest. Fall back to ASCII and
+            # ignore octets that do not fit.
+            # NOTE(review): digest.py is believed to do the same - confirm.
+            if ( !defined $payload ) {
+                $payload = Encode::decode( 'ascii', $octets, \&_encode_check_ignore );
+            }
+
+            if ( $subtype eq 'html' ) {
+                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
+                $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
+            }
+        }
+        else {
+
+            # This does no decoding, even of, e.g., quoted-printable or base64.
+            $payload = $parsed->body_raw();
+        }
+
+        push @payloads, $payload;
+    }
+
+    return \@payloads;
+}
+
+#----------------------------------------------------------------------
+
+=head2 normalize( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It modifies C<$STRING> in-place.
+
+As with the original implementation, if C<$STRING> contains (decoded)
+Unicode characters, those characters will be parsed accordingly. So:
+
+    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
+
+    normalize($str);
+
+The above will leave C<$str> alone, but this:
+
+    utf8::decode($str);
+
+    normalize($str);
+
+… will trim off the last two bytes from C<$str>.
+
+=cut
+
+sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
+
+    # NULs are bad, mm-kay?
+    $_[0] =~ tr<\0><>d;
+
+    # NB: Python’s \s without re.UNICODE is the same as Perl’s \s
+    # with the /a modifier.
+    #
+    # https://docs.python.org/2/library/re.html
+    # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
+
+    # Python: re.compile(r'\S{10,}')
+    $_[0] =~ s<\S{10,}><>ag;
+
+    # Python: re.compile(r'\S+@\S+')
+    $_[0] =~ s<\S+ @ \S+><>agx;
+
+    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
+    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
+
+    # (from digest.py …)
+    # Make sure we do the whitespace last because some of the previous
+    # patterns rely on whitespace.
+    $_[0] =~ tr< \x09-\x0d><>d;
+
+    # This is fun. digest.py’s normalize() does a non-UNICODE whitespace
+    # strip, then calls strip() on the string, which *will* strip Unicode
+    # whitespace from the ends.
+    $_[0] =~ s<\A\s+><>;
+    $_[0] =~ s<\s+\z><>;
+
+    return;
+}
+
+#----------------------------------------------------------------------
+
+=head2 $yn = should_handle_line( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It returns a boolean.
+
+=cut
+
+sub should_handle_line {
+    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
+}
+
+#----------------------------------------------------------------------
+
+=head2 $sr = assemble_lines( \@LINES )
+
+This assembles a string buffer out of @LINES. The string is the buffer
+of octets that will be hashed to produce the message digest.
+
+Each member of @LINES is expected to be an B<octet string>, not a
+character string.
+
+=cut
+
+sub assemble_lines {
+    my ($lines_ar) = @_;
+
+    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
+
+        # cf. handle_atomic() in digest.py
+        return \join( q<>, @$lines_ar );
+    }
+
+    #----------------------------------------------------------------------
+    # cf. handle_pieced() in digest.py
+
+    my $str = q<>;
+
+    for my $ofs_len ( _HASH_SPEC() ) {
+        my ( $offset, $length ) = @$ofs_len;
+
+        # $offset is a percentage into the list of lines; take up to
+        # $length consecutive lines starting there.
+        for my $i ( 0 .. ( $length - 1 ) ) {
+            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
+
+            next if !defined $lines_ar->[$idx];
+
+            $str .= $lines_ar->[$idx];
+        }
+    }
+
+    return \$str;
+}
+
+#----------------------------------------------------------------------
+
+=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
+
+Parses a Content-Type header value. Returns the main type, the subtype,
+the charset name (lower-cased, with underscores turned into dashes;
+C<ascii> when none is given), and the C<CHECK> value to hand to
+C<Encode::decode()>.
+
+=cut
+
+use constant _QUOTED_PRINTABLE_NAMES => (
+    "quopri-codec",
+    "quopri",
+    "quoted-printable",
+    "quotedprintable",
+);
+
+# Make Encode::decode() ignore anything that doesn’t fit the
+# given encoding.
+use constant _encode_check_ignore => q<>;
+
+sub parse_content_type {
+    my ($content_type) = @_;
+
+    # NOTE(review): this assignment is global and is not restored
+    # afterwards - confirm that this is intended.
+    $Email::MIME::ContentType::STRICT_PARAMS = 0;
+    my $ct_parse = Email::MIME::ContentType::parse_content_type(
+        $content_type,
+    );
+
+    my $main = $ct_parse->{'type'}    || q<>;
+    my $sub  = $ct_parse->{'subtype'} || q<>;
+
+    my $encoding = $ct_parse->{'attributes'}{'charset'};
+
+    my $checkval;
+
+    if ($encoding) {
+
+        # Lower-case everything, convert underscore to dash, and remove NUL.
+        $encoding =~ tr<A-Z_\0><a-z->d;
+
+        # Apparently pyzor accommodates messages that put the transfer
+        # encoding in the Content-Type.
+        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
+            $checkval = Encode::FB_CROAK();
+        }
+    }
+    else {
+        $encoding = 'ascii';
+    }
+
+    # Match Python .decode()’s 'ignore' behavior
+    $checkval ||= \&_encode_check_ignore;
+
+    return ( $main, $sub, $encoding, $checkval );
+}
+
+#----------------------------------------------------------------------
+
+=head2 @lines = splitlines( $TEXT )
+
+Imitates C<str.splitlines()>. (cf. C<pydoc str>)
+
+Returns a plain list in list context. Returns the number of
+items to be returned in scalar context.
+
+=cut
+
+sub splitlines {
+    my ($text) = @_;
+
+    # A line ends at CRLF, at a lone CR, or at a lone LF.
+    my $line_break = qr<\r\n?|\n>;
+
+    return split $line_break, $text;
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
new file mode 100644
index 0000000..2617b4a
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
@@ -0,0 +1,177 @@
+package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest::StripHtml
+
+=head1 SYNOPSIS
+
+    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
+
+=head1 DESCRIPTION
+
+This module attempts to duplicate pyzor’s HTML-stripping logic.
+
+=head1 ACCURACY
+
+This library cannot achieve 100%, bug-for-bug parity with pyzor
+because to do so would require duplicating Python’s own HTML parsing
+library. Since that library’s output has changed over time, and those
+changes in turn affect pyzor, it’s literally impossible to arrive at
+a single, fully-compatible reimplementation.
+
+That said, all known divergences between pyzor and this library involve
+invalid HTML as input.
+
+Please open bug reports for any divergences you identify, particularly
+if the input is valid HTML.
+
+=cut
+
+#----------------------------------------------------------------------
+
+use HTML::Parser ();
+
+our $VERSION = '0.03';
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $stripped = strip( $HTML )
+
+Give it some HTML, and it’ll give back the stripped text.
+
+In B<general>, the stripping consists of removing tags as well as
+C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
+removes HTML entities.
+
+This tries very hard to duplicate pyzor’s behavior with invalid HTML.
+
+=cut
+
+sub strip {
+    my ($html) = @_;
+
+    # Trim leading and trailing whitespace before parsing.
+    $html =~ s<\A\s+><>;
+    $html =~ s<\s+\z><>;
+
+    my $p = HTML::Parser->new( api_version => 3 );
+
+    # References to the cleaned-up text chunks, in document order.
+    my @pieces;
+
+    # True while text should be collected; false after a <script> or
+    # <style> start tag until the next end tag.
+    my $accumulate = 1;
+
+    $p->handler(
+        start => sub {
+            my ($tagname) = @_;
+
+            $accumulate = 0 if $tagname eq 'script';
+            $accumulate = 0 if $tagname eq 'style';
+
+            return;
+        },
+        'tagname',
+    );
+
+    # Any end tag, whatever its name, switches collecting back on.
+    $p->handler(
+        end => sub {
+            $accumulate = 1;
+            return;
+        }
+    );
+
+    $p->handler(
+        text => sub {
+            my ($copy) = @_;
+
+            return if !$accumulate;
+
+            # pyzor’s HTML parser discards HTML entities. On top of that,
+            # we need to match, as closely as possible, pyzor’s handling of
+            # invalid HTML entities … which is a function of Python’s
+            # standard HTML parsing library. This will probably never be
+            # fully compatible with the pyzor, but we can get it close.
+
+            # The original is:
+            #
+            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+            #
+            # The parsing loop then “backs up” one byte if the last
+            # character isn’t a “;”. We use a look-ahead assertion to
+            # mimic that behavior.
+            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
+
+            # The original is:
+            #
+            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+            #
+            # We again use a look-ahead assertion to mimic Python.
+            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
+
+            # Python’s HTMLParser aborts its parsing loop when it encounters
+            # an invalid numeric reference.
+            # Everything from the first invalid reference to the end of
+            # the chunk is replaced: by nothing when no ";" follows, and
+            # by a bare "&#" otherwise.
+            $copy =~ s<\&\#
+                (?:
+                    [^0-9xX]        # anything but the expected first char
+                    |
+                    [0-9]+[a-fA-F]  # hex within decimal
+                    |
+                    [xX][^0-9a-fA-F]
+                )
+                (.*)
+            ><
+                ( -1 == index($1, ';') ) ? q<> : '&#'
+            >exs;
+
+            # Python’s HTMLParser treats invalid entities as incomplete
+            $copy =~ s<(\&\#?)><$1 >gx;
+
+            $copy =~ s<\A\s+><>;
+            $copy =~ s<\s+\z><>;
+
+            # Chunks that end up empty are dropped.
+            push @pieces, \$copy if length $copy;
+        },
+        'text,tagname',
+    );
+
+    $p->parse($html);
+    $p->eof();
+
+    # Join the collected chunks with single spaces.
+    my $payload = join( q< >, map { $$_ } @pieces );
+
+    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
+    # plain spaces.
+    $payload =~ s<[^\S\x{a0}]+>< >g;
+
+    return $payload;
+}
+
+1;
diff --git a/t/pyzor.t b/t/pyzor.t index 891f38d..e4ef83f 100755 --- a/t/pyzor.t +++ b/t/pyzor.t @@ -3,12 +3,9 @@ use lib '.'; use lib 't'; use SATest; sa_t_init("pyzor"); -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x }; - use Test::More; plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests'); -plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR; -plan tests => 8; +plan tests => 5; diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.'); @@ -30,7 +27,7 @@ tstprefs (" sarun ("-t < data/spam/pyzor", \&patterns_run_cb); ok_all_patterns(); # Same with fork -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb); +sarun ("-t < data/spam/pyzor", \&patterns_run_cb); ok_all_patterns(); #TESTING FOR HAM @@ -44,7 +41,3 @@ ok_all_patterns(); sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb); ok_all_patterns(); -# same with fork -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb); -ok_all_patterns(); -
signature.asc
Description: PGP signature