At least these seem to be completely unneeded module dependencies:

IO::SigGuard (not even found in Ubuntu packages)
Email::MIME

So the code should be refactored to use SA methods as necessary.


On Sat, Oct 16, 2021 at 11:06:07PM -0400, Kevin A. McGrail wrote:
> No worries there that I know of.
> 
> cPanel has the paperwork for a CCLA on file and several people with ICLAs as
> well.  They've given us permission to commit the code too.
> 
> I think it will be better than any dependency on external binaries.
> 
> Regards,
> 
> KAM
> 
> On 10/14/2021 10:37 AM, Henrik K wrote:
> > If that's the case, I probably wouldn't have any objections.  Not sure if it
> > requires some Contributor License Agreement on cPanel's part (maybe they
> > already have one), and I guess at least a bug to make it official..  Sidney
> > or KAM can probably chime in on the admin side..
> > 
> > 
> > On Thu, Oct 14, 2021 at 04:32:53PM +0200, Giovanni Bechis wrote:
> > > Once committed, the code will no longer be developed by cPanel on CPAN
> > > and the original code will be removed.
> > > 
> > > I can work to integrate old and new Pyzor versions.
> > > 
> > >   Giovanni
> > > 
> > > On Thu, Oct 14, 2021 at 05:27:16PM +0300, Henrik K wrote:
> > > > If it's developed by cPanel in CPAN, then it should not be committed to 
> > > > SA,
> > > > unless it's clearly donated to SpamAssassin and removed from CPAN.  
> > > > Assuming
> > > > we have developer resources and will to take it aboard.
> > > > 
> > > > As it is, Plugin/Pyzor.pm should have an option to choose which one to 
> > > > use,
> > > > as it makes no sense to ditch support for the widely installed original
> > > > Pyzor.
> > > > 
> > > > 
> > > > On Thu, Oct 14, 2021 at 04:15:13PM +0200, Giovanni Bechis wrote:
> > > > > Hi,
> > > > > cPanel has developed a native Perl Pyzor implementation for 
> > > > > SpamAssassin
> > > > > and a diff against SpamAssassin 4.0 follows.
> > > > > Atm I am using it in production on a small server, more tests and
> > > > > opinions are welcome.
> > > > > 
> > > > > Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
> > > > > 
> > > > >   Cheers
> > > > >    Giovanni
> > > > > 
> > > > > diff --git a/MANIFEST b/MANIFEST
> > > > > index 25d0192..2d9588c 100644
> > > > > --- a/MANIFEST
> > > > > +++ b/MANIFEST
> > > > > @@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
> > > > >   lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
> > > > >   lib/Mail/SpamAssassin/PluginHandler.pm
> > > > >   lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > > > +lib/Mail/SpamAssassin/Pyzor.pm
> > > > >   lib/Mail/SpamAssassin/RegistryBoundaries.pm
> > > > >   lib/Mail/SpamAssassin/Reporter.pm
> > > > >   lib/Mail/SpamAssassin/SQLBasedAddrList.pm
> > > > > diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm 
> > > > > b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > > > index 3efd4b4..e4c9c05 100644
> > > > > --- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > > > +++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
> > > > > @@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
> > > > >   use Mail::SpamAssassin::Plugin;
> > > > >   use Mail::SpamAssassin::Logger;
> > > > > -use Mail::SpamAssassin::Timeout;
> > > > > -use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
> > > > > -                                proc_status_ok exit_status_str);
> > > > > +use Mail::SpamAssassin::Util qw(untaint_var);
> > > > > +
> > > > >   use strict;
> > > > >   use warnings;
> > > > >   # use bytes;
> > > > >   use re 'taint';
> > > > > -use Storable;
> > > > > -use POSIX qw(PIPE_BUF WNOHANG _exit);
> > > > > -
> > > > >   our @ISA = qw(Mail::SpamAssassin::Plugin);
> > > > >   sub new {
> > > > > @@ -78,7 +74,7 @@ sub set_config {
> > > > >     my ($self, $conf) = @_;
> > > > >     my @cmds;
> > > > > -=head1 USER OPTIONS
> > > > > +=head1 ADMINISTRATOR OPTIONS
> > > > >   =over 4
> > > > > @@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
> > > > >       type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
> > > > >     });
> > > > > -=item pyzor_fork (0|1)               (default: 0)
> > > > > -
> > > > > -Instead of running Pyzor synchronously, fork separate process for it 
> > > > > and
> > > > > -read the results in later (similar to async DNS lookups).  Increases
> > > > > -throughput.  Experimental.
> > > > > -
> > > > > -=cut
> > > > > -
> > > > > -  push(@cmds, {
> > > > > -    setting => 'pyzor_fork',
> > > > > -    is_admin => 1,
> > > > > -    default => 0,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > > > > -  });
> > > > > -
> > > > > -=item pyzor_count_min NUMBER (default: 5)
> > > > > +=item pyzor_count_min NUMBER         (default: 5)
> > > > >   This option sets how often a message's body checksum must have been
> > > > >   reported to the Pyzor server before SpamAssassin will consider the 
> > > > > Pyzor
> > > > > @@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
> > > > >       type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > >     });
> > > > > -  # Deprecated setting, the name makes no sense!
> > > > > -  push (@cmds, {
> > > > > -    setting => 'pyzor_max',
> > > > > -    is_admin => 1,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
> > > > > -    code => sub {
> > > > > -      my ($self, $key, $value, $line) = @_;
> > > > > -      warn("deprecated setting used, change pyzor_max to 
> > > > > pyzor_count_min\n");
> > > > > -      if ($value !~ /^\d+$/) {
> > > > > -        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > > > -      }
> > > > > -      $self->{pyzor_count_min} = $value;
> > > > > -    }
> > > > > -  });
> > > > > -
> > > > > -=item pyzor_whitelist_min NUMBER     (default: 10)
> > > > > -
> > > > > -This option sets how often a message's body checksum must have been
> > > > > -whitelisted to the Pyzor server for SpamAssassin to consider 
> > > > > ignoring the
> > > > > -result.  Final decision is made by pyzor_whitelist_factor.
> > > > > -
> > > > > -=cut
> > > > > -
> > > > > -  push (@cmds, {
> > > > > -    setting => 'pyzor_whitelist_min',
> > > > > -    is_admin => 1,
> > > > > -    default => 10,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > > -  });
> > > > > -
> > > > > -=item pyzor_whitelist_factor NUMBER  (default: 0.2)
> > > > > -
> > > > > -Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > > > > -For default setting this means: 50 reports requires 10 whitelistings.
> > > > > -
> > > > > -=cut
> > > > > -
> > > > > -  push (@cmds, {
> > > > > -    setting => 'pyzor_whitelist_factor',
> > > > > -    is_admin => 1,
> > > > > -    default => 0.2,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > > -  });
> > > > > -
> > > > >   =back
> > > > > -=head1 ADMINISTRATOR OPTIONS
> > > > > -
> > > > >   =over 4
> > > > >   =item pyzor_timeout n               (default: 5)
> > > > > @@ -210,478 +145,182 @@ removing one of them.
> > > > >       type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
> > > > >     });
> > > > > -=item pyzor_options options
> > > > > +=item pyzor_whitelist_min NUMBER        (default: 10)
> > > > > -Specify additional options to the pyzor(1) command. Please note that 
> > > > > only
> > > > > -characters in the range [0-9A-Za-z =,._/-] are allowed for security 
> > > > > reasons.
> > > > > +This option sets how often a message's body checksum must have been
> > > > > +whitelisted to the Pyzor server for SpamAssassin to consider 
> > > > > ignoring the
> > > > > +result.  Final decision is made by pyzor_whitelist_factor.
> > > > >   =cut
> > > > >     push (@cmds, {
> > > > > -    setting => 'pyzor_options',
> > > > > +    setting => 'pyzor_whitelist_min',
> > > > >       is_admin => 1,
> > > > > -    default => '',
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > > > > -    code => sub {
> > > > > -      my ($self, $key, $value, $line) = @_;
> > > > > -      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
> > > > > -     return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > > > -      }
> > > > > -      $self->{pyzor_options} = $1;
> > > > > -    }
> > > > > +    default => 10,
> > > > > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > >     });
> > > > > -=item pyzor_path STRING
> > > > > +=item pyzor_whitelist_factor NUMBER     (default: 0.2)
> > > > > -This option tells SpamAssassin specifically where to find the 
> > > > > C<pyzor>
> > > > > -client instead of relying on SpamAssassin to find it in the current
> > > > > -PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
> > > > > -you should use this, as the current PATH will have been cleared.
> > > > > +Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
> > > > > +For default setting this means: 50 reports requires 10 whitelistings.
> > > > >   =cut
> > > > >     push (@cmds, {
> > > > > -    setting => 'pyzor_path',
> > > > > +    setting => 'pyzor_whitelist_factor',
> > > > >       is_admin => 1,
> > > > > -    default => undef,
> > > > > -    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
> > > > > -    code => sub {
> > > > > -      my ($self, $key, $value, $line) = @_;
> > > > > -      if (!defined $value || !length $value) {
> > > > > -     return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
> > > > > -      }
> > > > > -      $value = untaint_file_path($value);
> > > > > -      if (!-x $value) {
> > > > > -     info("config: pyzor_path \"$value\" isn't an executable");
> > > > > -     return $Mail::SpamAssassin::Conf::INVALID_VALUE;
> > > > > -      }
> > > > > -
> > > > > -      $self->{pyzor_path} = $value;
> > > > > -    }
> > > > > +    default => 0.2,
> > > > > +    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
> > > > >     });
> > > > >     $conf->{parser}->register_commands(\@cmds);
> > > > >   }
> > > > >   sub is_pyzor_available {
> > > > > -  my ($self) = @_;
> > > > > +    my ($self) = @_;
> > > > > -  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
> > > > > -    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
> > > > > -
> > > > > -  unless ($pyzor && -x $pyzor) {
> > > > > -    dbg("pyzor: no pyzor executable found");
> > > > > -    $self->{pyzor_available} = 0;
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  # remember any found pyzor
> > > > > -  $self->{main}->{conf}->{pyzor_path} = $pyzor;
> > > > > -
> > > > > -  dbg("pyzor: pyzor is available: $pyzor");
> > > > > -  return 1;
> > > > > +    local $@;
> > > > > +    eval {
> > > > > +        require Mail::SpamAssassin::Pyzor::Digest;
> > > > > +        require Mail::SpamAssassin::Pyzor::Client;
> > > > > +    };
> > > > > +    return $@ ? 0 : 1;
> > > > >   }
> > > > > -sub finish_parsing_start {
> > > > > -  my ($self, $opts) = @_;
> > > > > +sub get_pyzor_interface {
> > > > > +  my ($self) = @_;
> > > > > -  # If forking, hard adjust priority -100 to launch early
> > > > > -  # Find rulenames from eval_to_rule mappings
> > > > > -  if ($opts->{conf}->{pyzor_fork}) {
> > > > > -    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
> > > > > -      dbg("pyzor: adjusting rule $_ priority to -100");
> > > > > -      $opts->{conf}->{priority}->{$_} = -100;
> > > > > -    }
> > > > > +  if (!$self->{main}->{conf}->{use_pyzor}) {
> > > > > +    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
> > > > > +    $self->{pyzor_interface} = "disabled";
> > > > > +    $self->{pyzor_available} = 0;
> > > > > +  }
> > > > > +  elsif ($self->is_pyzor_available()) {
> > > > > +    $self->{pyzor_interface} = "pyzor";
> > > > > +    $self->{pyzor_available} = 1;
> > > > > +  }
> > > > > +  else {
> > > > > +    dbg("pyzor: no pyzor found, disabling Pyzor");
> > > > > +    $self->{pyzor_available} = 0;
> > > > >     }
> > > > >   }
> > > > >   sub check_pyzor {
> > > > > -  my ($self, $pms, $full) = @_;
> > > > > -
> > > > > -  return 0 if !$self->{pyzor_available};
> > > > > -  return 0 if !$self->{main}->{conf}->{use_pyzor};
> > > > > -
> > > > > -  return 0 if $pms->{pyzor_running};
> > > > > -  $pms->{pyzor_running} = 1;
> > > > > -
> > > > > -  return 0 if !$self->is_pyzor_available();
> > > > > -
> > > > > -  my $timer = $self->{main}->time_method("check_pyzor");
> > > > > +  my ($self, $permsgstatus, $full) = @_;
> > > > >     # initialize valid tags
> > > > > -  $pms->{tag_data}->{PYZOR} = '';
> > > > > -
> > > > > -  # create fulltext tmpfile now (before possible forking)
> > > > > -  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
> > > > > -
> > > > > -  ## non-forking method
> > > > > -
> > > > > -  if (!$self->{main}->{conf}->{pyzor_fork}) {
> > > > > -    my @results = $self->pyzor_lookup($pms);
> > > > > -    return $self->_check_result($pms, \@results);
> > > > > -  }
> > > > > -
> > > > > -  ## forking method
> > > > > -
> > > > > -  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
> > > > > -  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
> > > > > -
> > > > > -  # create socketpair for communication
> > > > > -  $pms->{pyzor_backchannel} = 
> > > > > Mail::SpamAssassin::SubProcBackChannel->new();
> > > > > -  my $back_selector = '';
> > > > > -  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
> > > > > -  eval {
> > > > > -    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
> > > > > -  } or do {
> > > > > -    dbg("pyzor: backchannel pre-setup failed: $@");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  };
> > > > > +  $permsgstatus->{tag_data}->{PYZOR} = "";
> > > > > -  my $pid = fork();
> > > > > -  if (!defined $pid) {
> > > > > -    info("pyzor: child fork failed: $!");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  }
> > > > > -  if (!$pid) {
> > > > > -    $0 = "$0 (pyzor)";
> > > > > -    $SIG{CHLD} = 'DEFAULT';
> > > > > -    $SIG{PIPE} = 'IGNORE';
> > > > > -    $SIG{$_} = sub {
> > > > > -      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
> > > > > -      _exit(6);  # avoid END and destructor processing
> > > > > -      kill('KILL',$$);  # still kicking? die!
> > > > > -      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
> > > > > -    dbg("pyzor: child process $$ forked");
> > > > > -    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
> > > > > -    my @results = $self->pyzor_lookup($pms);
> > > > > -    my $backmsg;
> > > > > -    eval {
> > > > > -      $backmsg = Storable::freeze(\@results);
> > > > > -    };
> > > > > -    if ($@) {
> > > > > -      dbg("pyzor: child return value freeze failed: $@");
> > > > > -      _exit(0); # avoid END and destructor processing
> > > > > -    }
> > > > > -    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
> > > > > -      dbg("pyzor: child backchannel write failed: $!");
> > > > > -    }
> > > > > -    _exit(0); # avoid END and destructor processing
> > > > > -  }
> > > > > -
> > > > > -  $pms->{pyzor_pid} = $pid;
> > > > > +  my $timer = $self->{main}->time_method("check_pyzor");
> > > > > -  eval {
> > > > > -    
> > > > > $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
> > > > > -  } or do {
> > > > > -    dbg("pyzor: backchannel post-setup failed: $@");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  };
> > > > > +  $self->get_pyzor_interface();
> > > > > +  return 0 unless $self->{pyzor_available};
> > > > > -  return 0;
> > > > > +  return $self->pyzor_lookup($permsgstatus, $full);
> > > > >   }
> > > > >   sub pyzor_lookup {
> > > > > -  my ($self, $pms) = @_;
> > > > > -
> > > > > -  my $conf = $self->{main}->{conf};
> > > > > -  my $timeout = $conf->{pyzor_timeout};
> > > > > -
> > > > > -  # note: not really tainted, this came from system configuration 
> > > > > file
> > > > > -  my $path = untaint_file_path($conf->{pyzor_path});
> > > > > -  my $opts = untaint_var($conf->{pyzor_options}) || '';
> > > > > -
> > > > > -  $pms->enter_helper_run_mode();
> > > > > -
> > > > > -  my $pid;
> > > > > -  my @resp;
> > > > > -  my $timer = Mail::SpamAssassin::Timeout->new(
> > > > > -           { secs => $timeout, deadline => $pms->{master_deadline} 
> > > > > });
> > > > > -  my $err = $timer->run_and_catch(sub {
> > > > > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > > > > -
> > > > > -    dbg("pyzor: opening pipe: ".
> > > > > -      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
> > > > > -
> > > > > -    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > > > > -     $pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
> > > > > -    $pid or die "$!\n";
> > > > > -
> > > > > -    # read+split avoids a Perl I/O bug (Bug 5985)
> > > > > -    my($inbuf, $nread);
> > > > > -    my $resp = '';
> > > > > -    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
> > > > > -    defined $nread  or die "error reading from pipe: $!";
> > > > > -    @resp = split(/^/m, $resp, -1);
> > > > > -
> > > > > -    my $errno = 0;
> > > > > -    close PYZOR or $errno = $!;
> > > > > -    if (proc_status_ok($?, $errno)) {
> > > > > -      dbg("pyzor: [%s] finished successfully", $pid);
> > > > > -    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it 
> > > > > exits with 1
> > > > > -      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, 
> > > > > $errno));
> > > > > -    } else {
> > > > > -      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, 
> > > > > $errno));
> > > > > -    }
> > > > > -
> > > > > -  });
> > > > > -
> > > > > -  if (defined(fileno(*PYZOR))) {  # still open
> > > > > -    if ($pid) {
> > > > > -      if (kill('TERM', $pid)) {
> > > > > -        dbg("pyzor: killed stale helper [$pid]");
> > > > > -      } else {
> > > > > -        dbg("pyzor: killing helper application [$pid] failed: $!");
> > > > > -      }
> > > > > -    }
> > > > > -    my $errno = 0;
> > > > > -    close PYZOR or $errno = $!;
> > > > > -    proc_status_ok($?, $errno)
> > > > > -      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, 
> > > > > $errno));
> > > > > -  }
> > > > > -
> > > > > -  $pms->leave_helper_run_mode();
> > > > > -
> > > > > -  if ($timer->timed_out()) {
> > > > > -    dbg("pyzor: check timed out after $timeout seconds");
> > > > > -    return ();
> > > > > -  } elsif ($err) {
> > > > > -    chomp $err;
> > > > > -    info("pyzor: check failed: $err");
> > > > > -    return ();
> > > > > -  }
> > > > > -
> > > > > -  return @resp;
> > > > > -}
> > > > > -
> > > > > -sub check_tick {
> > > > > -  my ($self, $opts) = @_;
> > > > > -  $self->_check_forked_result($opts->{permsgstatus}, 0);
> > > > > -}
> > > > > -
> > > > > -sub check_cleanup {
> > > > > -  my ($self, $opts) = @_;
> > > > > -  $self->_check_forked_result($opts->{permsgstatus}, 1);
> > > > > -}
> > > > > -
> > > > > -sub _check_forked_result {
> > > > > -  my ($self, $pms, $finish) = @_;
> > > > > -
> > > > > -  return 0 if !$pms->{pyzor_backchannel};
> > > > > -  return 0 if !$pms->{pyzor_pid};
> > > > > +    my ( $self, $permsgstatus, $fulltext ) = @_;
> > > > > +    my $conf = $self->{main}->{conf};
> > > > > +    my $timeout = $conf->{pyzor_timeout};
> > > > > +
> > > > > +    my $client = ( $self->{'_pyzor_client'} ||= 
> > > > > Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
> > > > > +
> > > > > +    local $@;
> > > > > +    my $ref = eval { $client->check($digest); };
> > > > > +    dbg("pyzor: got response: $client->{'_server_host'}");
> > > > > +    # $client reply must be an hash
> > > > > +    return 0 if (not (ref $ref eq ref {}));
> > > > > +    if ($@) {
> > > > > +        my $err = $@;
> > > > > -  my $timer = $self->{main}->time_method("check_pyzor");
> > > > > +        $err = eval { $err->get_message() } || $err;
> > > > > -  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || 
> > > > > $pms->{shortcircuited};
> > > > > -
> > > > > -  my $kid_pid = $pms->{pyzor_pid};
> > > > > -  # if $finish, force waiting for the child
> > > > > -  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : 
> > > > > WNOHANG);
> > > > > -  if ($pid == 0) {
> > > > > -    #dbg("pyzor: child process $kid_pid not finished yet, trying 
> > > > > later");
> > > > > -    if ($pms->{pyzor_abort}) {
> > > > > -      dbg("pyzor: bailing out due to deadline/shortcircuit");
> > > > > -      kill('TERM', $kid_pid);
> > > > > -      if (waitpid($kid_pid, WNOHANG) == 0) {
> > > > > -        sleep(1);
> > > > > -        if (waitpid($kid_pid, WNOHANG) == 0) {
> > > > > -          dbg("pyzor: child process $kid_pid still alive, KILL");
> > > > > -          kill('KILL', $kid_pid);
> > > > > -          waitpid($kid_pid, 0);
> > > > > +        warn("pyzor: check failed: $err\n");
> > > > > +        return 0;
> > > > > +    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
> > > > > +        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
> > > > > +          dbg("pyzor: check failed with invalid code: 
> > > > > $ref->{'Code'}: $ref->{'Diag'}");
> > > > > +        } else {
> > > > > +          dbg("pyzor: check failed with undefined code");
> > > > >           }
> > > > > -      }
> > > > > -      delete $pms->{pyzor_pid};
> > > > > -      delete $pms->{pyzor_backchannel};
> > > > > +        return 0;
> > > > >       }
> > > > > -    return 0;
> > > > > -  } elsif ($pid == -1) {
> > > > > -    # child does not exist?
> > > > > -    dbg("pyzor: child process $kid_pid already handled?");
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  }
> > > > > -  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for 
> > > > > metas
> > > > > +    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
> > > > > +    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
> > > > > +    my $count_min = $conf->{pyzor_count_min};
> > > > > +    my $wl_min = $conf->{pyzor_whitelist_min};
> > > > > -  dbg("pyzor: child process $kid_pid finished, reading results");
> > > > > +    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
> > > > > +      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
> > > > > -  my $backmsg;
> > > > > -  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, 
> > > > > $backmsg, PIPE_BUF);
> > > > > -  if (!defined $ret || $ret == 0) {
> > > > > -    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : 
> > > > > $!));
> > > > > -    delete $pms->{pyzor_backchannel};
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  delete $pms->{pyzor_backchannel};
> > > > > +    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, 
> > > > > whitelisted $pyzor_whitelisted times.");
> > > > > -  my $results;
> > > > > -  eval {
> > > > > -    $results = Storable::thaw($backmsg);
> > > > > -  };
> > > > > -  if ($@) {
> > > > > -    dbg("pyzor: child return value thaw failed: $@");
> > > > > -    return;
> > > > > -  }
> > > > > -
> > > > > -  $self->_check_result($pms, $results);
> > > > > -}
> > > > > +    dbg("pyzor: result: COUNT=$pyzor_count/$count_min 
> > > > > WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
> > > > > +      $wl_limit);
> > > > > -sub _check_result {
> > > > > -  my ($self, $pms, $results) = @_;
> > > > > -
> > > > > -  if (!@$results) {
> > > > > -    dbg("pyzor: no response from server");
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  my $count = 0;
> > > > > -  my $count_wl = 0;
> > > > > -  foreach my $res (@$results) {
> > > > > -    chomp($res);
> > > > > -    if ($res =~ /^Traceback/) {
> > > > > -      info("pyzor: internal error, python traceback seen in 
> > > > > response: $res");
> > > > > +    # Empty body etc results in same hash, we should skip very large 
> > > > > numbers..
> > > > > +    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
> > > > > +      dbg("pyzor: result exceeded hardcoded limits, ignoring: 
> > > > > count/wl 1000000/10000");
> > > > >         return 0;
> > > > >       }
> > > > > -    dbg("pyzor: got response: $res");
> > > > > -    # this regexp is intended to be a little bit forgiving
> > > > > -    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
> > > > > -      # until pyzor servers can sync their DBs,
> > > > > -      # sum counts obtained from all servers
> > > > > -      $count += untaint_var($1)+0; # crazy but needs untainting
> > > > > -      $count_wl += untaint_var($2)+0;
> > > > > -    } else {
> > > > > -      # warn on failures to parse
> > > > > -      info("pyzor: failure to parse response \"$res\"");
> > > > > -    }
> > > > > -  }
> > > > > -
> > > > > -  my $conf = $self->{main}->{conf};
> > > > > -
> > > > > -  my $count_min = $conf->{pyzor_count_min};
> > > > > -  my $wl_min = $conf->{pyzor_whitelist_min};
> > > > > -  my $wl_limit = $count_wl >= $wl_min ?
> > > > > -    $count * $conf->{pyzor_whitelist_factor} : 0;
> > > > > -
> > > > > -  dbg("pyzor: result: COUNT=$count/$count_min 
> > > > > WHITELIST=$count_wl/$wl_min/%.1f",
> > > > > -    $wl_limit);
> > > > > -  $pms->set_tag('PYZOR', "Reported $count times, whitelisted 
> > > > > $count_wl times.");
> > > > > -
> > > > > -  # Empty body etc results in same hash, we should skip very large 
> > > > > numbers..
> > > > > -  if ($count >= 1000000 || $count_wl >= 10000) {
> > > > > -    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 
> > > > > 1000000/10000");
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  # Whitelisted?
> > > > > -  if ($wl_limit && $count_wl >= $wl_limit) {
> > > > > -    dbg("pyzor: message whitelisted");
> > > > > -    return 0;
> > > > > -  }
> > > > > +    # Whitelisted?
> > > > > +    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
> > > > > +      dbg("pyzor: message whitelisted");
> > > > > +      return 0;
> > > > > +    }
> > > > > -  if ($count >= $count_min) {
> > > > > -    if ($conf->{pyzor_fork}) {
> > > > > -      # forked needs to run got_hit()
> > > > > -      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
> > > > > +    if ( $pyzor_count >= $count_min ) {
> > > > > +      return 1;
> > > > >       }
> > > > > -    return 1;
> > > > > -  }
> > > > > -  return 0;
> > > > > +    return 0;
> > > > >   }
> > > > >   sub plugin_report {
> > > > >     my ($self, $options) = @_;
> > > > > -  return if !$self->{pyzor_available};
> > > > > -  return if !$self->{main}->{conf}->{use_pyzor};
> > > > > -  return if $options->{report}->{options}->{dont_report_to_pyzor};
> > > > > -  return if !$self->is_pyzor_available();
> > > > > -
> > > > > -  # use temporary file: open2() is unreliable due to buffering under 
> > > > > spamd
> > > > > -  my $tmpf = 
> > > > > $options->{report}->create_fulltext_tmpfile($options->{text});
> > > > > -  if ($self->pyzor_report($options, $tmpf)) {
> > > > > -    $options->{report}->{report_available} = 1;
> > > > > -    info("reporter: spam reported to Pyzor");
> > > > > -    $options->{report}->{report_return} = 1;
> > > > > -  }
> > > > > -  else {
> > > > > -    info("reporter: could not report spam to Pyzor");
> > > > > -  }
> > > > > -  $options->{report}->delete_fulltext_tmpfile($tmpf);
> > > > > +  return unless $self->{pyzor_available};
> > > > > +  return unless $self->{main}->{conf}->{use_pyzor};
> > > > > -  return 1;
> > > > > +  if (!$options->{report}->{options}->{dont_report_to_pyzor} && 
> > > > > $self->is_pyzor_available())
> > > > > +  {
> > > > > +    if ($self->pyzor_report($options)) {
> > > > > +      $options->{report}->{report_available} = 1;
> > > > > +      info("reporter: spam reported to Pyzor");
> > > > > +      $options->{report}->{report_return} = 1;
> > > > > +    }
> > > > > +    else {
> > > > > +      info("reporter: could not report spam to Pyzor");
> > > > > +    }
> > > > > +  }
> > > > >   }
> > > > >   sub pyzor_report {
> > > > > -  my ($self, $options, $tmpf) = @_;
> > > > > -
> > > > > -  # note: not really tainted, this came from system configuration 
> > > > > file
> > > > > -  my $path = 
> > > > > untaint_file_path($options->{report}->{conf}->{pyzor_path});
> > > > > -  my $opts = 
> > > > > untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
> > > > > +    my ( $self, $options ) = @_;
> > > > > -  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> > > > > +    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
> > > > > -  $options->{report}->enter_helper_run_mode();
> > > > > +    my $client = ( $self->{'_pyzor_client'} ||= 
> > > > > Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
> > > > > -  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
> > > > > -  my $err = $timer->run_and_catch(sub {
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( 
> > > > > $options->{'text'} );
> > > > > -    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
> > > > > -
> > > > > -    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", 
> > > > > "< $tmpf"));
> > > > > -
> > > > > -    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
> > > > > -     $tmpf, 1, $path, split(' ', $opts), "report");
> > > > > -    $pid or die "$!\n";
> > > > > -
> > > > > -    my($inbuf,$nread,$nread_all); $nread_all = 0;
> > > > > -    # response is ignored, just check its existence
> > > > > -    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
> > > > > -    defined $nread  or die "error reading from pipe: $!";
> > > > > -
> > > > > -    dbg("pyzor: empty response")  if $nread_all < 1;
> > > > > -
> > > > > -    my $errno = 0;  close PYZOR or $errno = $!;
> > > > > -    # closing a pipe also waits for the process executing on the 
> > > > > pipe to
> > > > > -    # complete, no need to explicitly call waitpid
> > > > > -    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
> > > > > -    if (proc_status_ok($?,$errno, 0)) {
> > > > > -      dbg("pyzor: [%s] reporter finished successfully", $pid);
> > > > > -    } else {
> > > > > -      info("pyzor: [%s] reporter error: %s", $pid, 
> > > > > exit_status_str($?,$errno));
> > > > > +    local $@;
> > > > > +    my $ref = eval { $client->report($digest); };
> > > > > +    if ($@) {
> > > > > +        warn("pyzor: report failed: $@");
> > > > > +        return 0;
> > > > >       }
> > > > > -
> > > > > -  });
> > > > > -
> > > > > -  $options->{report}->leave_helper_run_mode();
> > > > > -
> > > > > -  if ($timer->timed_out()) {
> > > > > -    dbg("reporter: pyzor report timed out after $timeout seconds");
> > > > > -    return 0;
> > > > > -  }
> > > > > -
> > > > > -  if ($err) {
> > > > > -    chomp $err;
> > > > > -    if ($err eq '__brokenpipe__ignore__') {
> > > > > -      dbg("reporter: pyzor report failed: broken pipe");
> > > > > -    } else {
> > > > > -      warn("reporter: pyzor report failed: $err\n");
> > > > > +    elsif ( $ref->{'Code'} ne 200 ) {
> > > > > +        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: 
> > > > > $ref->{'Diag'}");
> > > > > +        return 0;
> > > > >       }
> > > > > -    return 0;
> > > > > -  }
> > > > > -  return 1;
> > > > > +    return 1;
> > > > >   }
> > > > > -# Version features
> > > > > -sub has_fork { 1 }
> > > > > -
> > > > >   1;
> > > > > -
> > > > > -=back
> > > > > -
> > > > > -=cut
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor.pm 
> > > > > b/lib/Mail/SpamAssassin/Pyzor.pm
> > > > > new file mode 100644
> > > > > index 0000000..8ac27f4
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor.pm
> > > > > @@ -0,0 +1,56 @@
> > > > > +package Mail::SpamAssassin::Pyzor;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed 
> > > > > with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, 
> > > > > Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance 
> > > > > with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
> > > > > implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +our $VERSION = '0.06_01';
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +This distribution contains Perl implementations of parts of
> > > > > +L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
> > > > > +It is intended for use with L<Mail::SpamAssassin> but may be useful
> > > > > +in other contexts.
> > > > > +
> > > > > +See the following modules for information on specific tools that
> > > > > +the distribution includes:
> > > > > +
> > > > > +=over
> > > > > +
> > > > > +=item * L<Mail::SpamAssassin::Pyzor::Client>
> > > > > +
> > > > > +=item * L<Mail::SpamAssassin::Pyzor::Digest>
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm 
> > > > > b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > > > new file mode 100644
> > > > > index 0000000..ccff868
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
> > > > > @@ -0,0 +1,415 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Client;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed 
> > > > > with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, 
> > > > > Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance 
> > > > > with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
> > > > > implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
> > > > > +
> > > > > +=head1 SYNOPSIS
> > > > > +
> > > > > +    use Mail::SpamAssassin::Pyzor::Client ();
> > > > > +    use Mail::SpamAssassin::Pyzor::Digest ();
> > > > > +
> > > > > +    my $client = Mail::SpamAssassin::Pyzor::Client->new();
> > > > > +
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
> > > > > +
> > > > > +    my $check_ref = $client->check($digest);
> > > > > +    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
> > > > > +
> > > > > +    my $report_ref = $client->report($digest);
> > > > > +    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +A bare-bones L<Pyzor|http://pyzor.org> client that currently only
> > > > > +implements the functionality needed for L<Mail::SpamAssassin>.
> > > > > +
> > > > > +=head1 PROTOCOL DETAILS
> > > > > +
> > > > > +The Pyzor protocol is not a published standard, and there appears to 
> > > > > be
> > > > > +no meaningful public documentation. What follows is enough 
> > > > > information,
> > > > > +largely gleaned through forum posts and reverse engineering, to 
> > > > > facilitate
> > > > > +effective use of this module:
> > > > > +
> > > > > +Pyzor is an RPC-oriented, message-based protocol. Each message
> > > > > +is a simple dictionary of 7-bit ASCII keys and values. Server 
> > > > > responses
> > > > > +always include at least the following:
> > > > > +
> > > > > +=over
> > > > > +
> > > > > +=item * C<Code> - Similar to HTTP status codes; anything besides 
> > > > > C<200>
> > > > > +is an error.
> > > > > +
> > > > > +=item * C<Diag> - Similar to HTTP status reasons: a text description
> > > > > +of the status.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +(NB: There are additional standard response headers that are useful 
> > > > > only for
> > > > > +the protocol itself and thus are not part of this module's 
> > > > > returns.)
> > > > > +
> > > > > +=head2 Reliability
> > > > > +
> > > > > +Pyzor uses UDP rather than TCP, so no message is guaranteed to reach 
> > > > > its
> > > > > +destination. A transmission failure can happen in either the request 
> > > > > or
> > > > > +the response; in either case the request times out: the client emits a
> > > > > +warning and the request method returns 0 instead of a response hashref.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +our $VERSION = '0.04';
> > > > > +
> > > > > +our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
> > > > > +our $DEFAULT_SERVER_PORT    = 24441;
> > > > > +our $DEFAULT_USERNAME       = 'anonymous';
> > > > > +our $DEFAULT_PASSWORD       = '';
> > > > > +our $DEFAULT_OP_SPEC        = '20,3,60,3';
> > > > > +our $PYZOR_PROTOCOL_VERSION = 2.1;
> > > > > +our $DEFAULT_TIMEOUT        = 3.5;
> > > > > +our $READ_SIZE              = 8192;
> > > > > +
> > > > > +use IO::Socket::INET ();
> > > > > +use Digest::SHA qw(sha1 sha1_hex);
> > > > > +
> > > > > +my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 
> > > > > 'User', 'Time', 'Sig' );
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 CONSTRUCTOR
> > > > > +
> > > > > +=head2 new(%OPTS)
> > > > > +
> > > > > +Create a new pyzor client.
> > > > > +
> > > > > +=over 2
> > > > > +
> > > > > +=item Input
> > > > > +
> > > > > +%OPTS are (all optional):
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item * C<server_host> - The pyzor server host to connect to 
> > > > > (default is
> > > > > +C<public.pyzor.org>)
> > > > > +
> > > > > +=item * C<server_port> - The pyzor server port to connect to 
> > > > > (default is
> > > > > +24441)
> > > > > +
> > > > > +=item * C<username> - The username to present to the pyzor server 
> > > > > (default
> > > > > +is C<anonymous>)
> > > > > +
> > > > > +=item * C<password> - The password to present to the pyzor server 
> > > > > (default
> > > > > +is empty)
> > > > > +
> > > > > +=item * C<timeout> - The maximum time, in seconds, to wait for a 
> > > > > response
> > > > > +from the pyzor server (default is 3.5)
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=item Output
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub new {
> > > > > +    my ( $class, %OPTS ) = @_;
> > > > > +
> > > > > +    return bless {
> > > > > +        '_server_host' => $OPTS{'server_host'} || 
> > > > > $DEFAULT_SERVER_HOST,
> > > > > +        '_server_port' => $OPTS{'server_port'} || 
> > > > > $DEFAULT_SERVER_PORT,
> > > > > +        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
> > > > > +        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
> > > > > +        '_op_spec'     => $DEFAULT_OP_SPEC,
> > > > > +        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
> > > > > +    }, $class;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 REQUEST METHODS
> > > > > +
> > > > > +=head2 report($digest)
> > > > > +
> > > > > +Report the digest of a spam message to the pyzor server. This 
> > > > > function
> > > > > +will throw if a messaging failure or timeout happens.
> > > > > +
> > > > > +=over 2
> > > > > +
> > > > > +=item Input
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item $digest C<SCALAR>
> > > > > +
> > > > > +The message digest to report, as given by
> > > > > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=item Output
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item C<HASHREF>
> > > > > +
> > > > > +Returns a hashref of the standard attributes noted above.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub report {
> > > > > +    my ( $self, $digest ) = @_;
> > > > > +
> > > > > +    my $msg_ref = $self->_get_base_msg( 'report', $digest );
> > > > > +
> > > > > +    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
> > > > > +
> > > > > +    return $self->_send_receive_msg($msg_ref);
> > > > > +}
> > > > > +
> > > > > +=head2 check($digest)
> > > > > +
> > > > > +Check the digest of a message to see if
> > > > > +the pyzor server has a report for it. This function
> > > > > +will throw if a messaging failure or timeout happens.
> > > > > +
> > > > > +=over 2
> > > > > +
> > > > > +=item Input
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item $digest C<SCALAR>
> > > > > +
> > > > > +The message digest to check, as given by
> > > > > +C<Mail::SpamAssassin::Pyzor::Digest::get()>.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=item Output
> > > > > +
> > > > > +=over 3
> > > > > +
> > > > > +=item C<HASHREF>
> > > > > +
> > > > > +Returns a hashref of the standard attributes noted above
> > > > > +as well as the following:
> > > > > +
> > > > > +=over
> > > > > +
> > > > > +=item * C<Count> - The number of reports the server has received
> > > > > +for the given digest.
> > > > > +
> > > > > +=item * C<WL-Count> - The number of whitelist requests the server 
> > > > > has received
> > > > > +for the given digest.
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=back
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub check {
> > > > > +    my ( $self, $digest ) = @_;
> > > > > +
> > > > > +    return $self->_send_receive_msg( $self->_get_base_msg( 'check', 
> > > > > $digest ) );
> > > > > +}
> > > > > +
> > > > > +# ----------------------------------------
> > > > > +
> > > > > +sub _send_receive_msg {
> > > > > +    my ( $self, $msg_ref ) = @_;
> > > > > +
> > > > > +    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
> > > > > +
> > > > > +    $self->_sign_msg($msg_ref);
> > > > > +
> > > > > +    return $self->_do_send_receive(
> > > > > +        $self->_generate_packet_from_message($msg_ref) . "\n\n",
> > > > > +        $thread_id,
> > > > > +    );
> > > > > +}
> > > > > +
> > > > > +sub _get_base_msg {
> > > > > +    my ( $self, $op, $digest ) = @_;
> > > > > +
> > > > > +    die "Implementor error: op is required" if !$op;
> > > > > +    die "error: digest is required"         if !$digest;
> > > > > +
> > > > > +    return {
> > > > > +        'User'      => $self->{'_username'},
> > > > > +        'PV'        => $PYZOR_PROTOCOL_VERSION,
> > > > > +        'Time'      => time(),
> > > > > +        'Op'        => $op,
> > > > > +        'Op-Digest' => $digest,
> > > > > +        'Thread'    => $self->_generate_thread_id()
> > > > > +    };
> > > > > +}
> > > > > +
> > > > > +sub _do_send_receive {
> > > > > +    my ( $self, $packet, $thread_id ) = @_;
> > > > > +
> > > > > +    my $sock = $self->_get_connection_or_die();
> > > > > +
> > > > > +    $self->_send_packet( $sock, $packet );
> > > > > +    my $response = $self->_receive_packet( $sock, $thread_id );
> > > > > +
> > > > > +    return 0 if not defined $response;
> > > > > +
> > > > > +    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, 
> > > > > $response ) };
> > > > > +
> > > > > +    delete $resp_hr->{'Thread'};
> > > > > +
> > > > > +    my $response_pv = delete $resp_hr->{'PV'};
> > > > > +
> > > > > +    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
> > > > > +        warn "Unexpected protocol version ($response_pv) in Pyzor 
> > > > > response!";
> > > > > +    }
> > > > > +
> > > > > +    return $resp_hr;
> > > > > +}
> > > > > +
> > > > > +sub _receive_packet {
> > > > > +    my ( $self, $sock, $thread_id ) = @_;
> > > > > +
> > > > > +    my $timeout = $self->{'_timeout'} * 1000;
> > > > > +
> > > > > +    my $end_time = time + $self->{'_timeout'};
> > > > > +
> > > > > +    $sock->blocking(0);
> > > > > +    my $response = '';
> > > > > +    my $rout     = '';
> > > > > +    my $rin      = '';
> > > > > +    vec( $rin, fileno($sock), 1 ) = 1;
> > > > > +
> > > > > +    while (1) {
> > > > > +        my $time_left = $end_time - time;
> > > > > +
> > > > > +        if ( $time_left <= 0 ) {
> > > > > +          warn("Did not receive a response from the pyzor server 
> > > > > $self->{'_server_host'}:$self->{'_server_port'} for 
> > > > > $self->{'_timeout'} seconds!");
> > > > > +          return;
> > > > > +        }
> > > > > +
> > > > > +        my $bytes = sysread( $sock, $response, $READ_SIZE, length 
> > > > > $response );
> > > > > +        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} 
> > > > > ) {
> > > > > +            warn "read from socket: $!";
> > > > > +        }
> > > > > +
> > > > > +        if ( index( $response, "\n\n" ) > -1 ) {
> > > > > +
> > > > > +            # Reject the response unless its thread ID matches what 
> > > > > we sent.
> > > > > +            # This prevents confusion among concurrent Pyzor 
> > > > > requests.
> > > > > +            if ( index( $response, "\nThread: $thread_id\n" ) != -1 
> > > > > ) {
> > > > > +                last;
> > > > > +            }
> > > > > +            else {
> > > > > +                $response = '';
> > > > > +            }
> > > > > +        }
> > > > > +
> > > > > +        my $found = select( $rout = $rin, undef, undef, $time_left );
> > > > > +        warn "select(): $!" if $found == -1;
> > > > > +    }
> > > > > +
> > > > > +    return $response;
> > > > > +}
> > > > > +
> > > > > +sub _send_packet {
> > > > > +    my ( $self, $sock, $packet ) = @_;
> > > > > +
> > > > > +    $sock->blocking(1);
> > > > > +    syswrite( $sock, $packet ) or warn "write to socket: $!";
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +sub _get_connection_or_die {
> > > > > +    my ($self) = @_;
> > > > > +
> > > > > +    # clear the socket if the PID changes
> > > > > +    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ 
> > > > > ) {
> > > > > +        undef $self->{'_sock_pid'};
> > > > > +        undef $self->{'_sock'};
> > > > > +    }
> > > > > +
> > > > > +    $self->{'_sock_pid'} ||= $$;
> > > > > +    $self->{'_sock'}     ||= IO::Socket::INET->new(
> > > > > +        'PeerHost' => $self->{'_server_host'},
> > > > > +        'PeerPort' => $self->{'_server_port'},
> > > > > +        'Proto'    => 'udp'
> > > > > +    ) or die "Cannot connect to 
> > > > > $self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
> > > > > +
> > > > > +    return $self->{'_sock'};
> > > > > +}
> > > > > +
> > > > > +sub _sign_msg {
> > > > > +    my ( $self, $msg_ref ) = @_;
> > > > > +
> > > > > +    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
> > > > > +        Digest::SHA::sha1( 
> > > > > $self->_generate_packet_from_message($msg_ref) )
> > > > > +    );
> > > > > +
> > > > > +    return 1;
> > > > > +}
> > > > > +
> > > > > +sub _generate_packet_from_message {
> > > > > +    my ( $self, $msg_ref ) = @_;
> > > > > +
> > > > > +    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length 
> > > > > $msg_ref->{$_} } @hash_order );
> > > > > +}
> > > > > +
> > > > > +sub _generate_thread_id {
> > > > > +    my $RAND_MAX = 2**16;
> > > > > +    my $val      = 0;
> > > > > +    $val = int rand($RAND_MAX) while $val < 1024;
> > > > > +    return $val;
> > > > > +}
> > > > > +
> > > > > +sub _get_user_pass_hash_key {
> > > > > +    my ($self) = @_;
> > > > > +
> > > > > +    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . 
> > > > > $self->{'_password'} );
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm 
> > > > > b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > > > new file mode 100644
> > > > > index 0000000..0e8a5ae
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
> > > > > @@ -0,0 +1,103 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Digest;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed 
> > > > > with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, 
> > > > > Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance 
> > > > > with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
> > > > > implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Digest
> > > > > +
> > > > > +=head1 SYNOPSIS
> > > > > +
> > > > > +    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text 
> > > > > );
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +A reimplementation of 
> > > > > L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +use Email::MIME ();
> > > > > +
> > > > > +use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
> > > > > +use Digest::SHA qw(sha1_hex);
> > > > > +
> > > > > +our $VERSION = '0.03';
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 FUNCTIONS
> > > > > +
> > > > > +=head2 $hex = get( $MSG )
> > > > > +
> > > > > +This takes an email message in raw MIME text format (i.e., as saved 
> > > > > in the
> > > > > +standard mbox format) and returns the message's Pyzor digest in 
> > > > > lower-case
> > > > > +hexadecimal.
> > > > > +
> > > > > +The output from this function should normally be identical to that of
> > > > > +the C<pyzor> script's C<digest> command. It is suitable for use in
> > > > > +L<Mail::SpamAssassin::Pyzor::Client>'s request methods.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub get {
> > > > > +    my ($text) = @_;
> > > > > +    return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );
> > > > > +}
> > > > > +
> > > > > +# NB: This is called from the test.
> > > > > +sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
> > > > > +    my ($msg_text_sr) = @_;
> > > > > +
> > > > > +    my $parsed = Email::MIME->new($$msg_text_sr);
> > > > > +
> > > > > +    my @lines;
> > > > > +
> > > > > +    my $payloads_ar = 
> > > > > Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
> > > > > +
> > > > > +    for my $payload (@$payloads_ar) {
> > > > > +        my @p_lines = 
> > > > > Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
> > > > > +        for my $line (@p_lines) {
> > > > > +            
> > > > > Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
> > > > > +
> > > > > +            next if 
> > > > > !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
> > > > > +
> > > > > +            # Make sure we have an octet string.
> > > > > +            utf8::encode($line) if utf8::is_utf8($line);
> > > > > +
> > > > > +            push @lines, $line;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    my $digest_sr = 
> > > > > Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
> > > > > +
> > > > > +    return $digest_sr;
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm 
> > > > > b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > > > new file mode 100644
> > > > > index 0000000..522accd
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
> > > > > @@ -0,0 +1,301 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Digest::Pieces;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed 
> > > > > with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, 
> > > > > Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance 
> > > > > with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
> > > > > implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Digest::Pieces
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +This module houses backend logic for 
> > > > > L<Mail::SpamAssassin::Pyzor::Digest>.
> > > > > +
> > > > > +It reimplements logic found in pyzor's F<digest.py> module
> > > > > +(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +use Email::MIME::ContentType ();
> > > > > +use Encode                   ();
> > > > > +
> > > > > +our $VERSION = '0.03';
> > > > > +
> > > > > +# each tuple is [ offset, length ]
> > > > > +use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
> > > > > +
> > > > > +use constant {
> > > > > +    _MIN_LINE_LENGTH => 8,
> > > > > +
> > > > > +    _ATOMIC_NUM_LINES => 4,
> > > > > +};
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 FUNCTIONS
> > > > > +
> > > > > +=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
> > > > > +
> > > > > +This imitates the corresponding object method in F<digest.py>.
> > > > > +It returns a reference to an array of strings. Each string can be 
> > > > > either
> > > > > +a byte string or a character string (e.g., UTF-8 decoded).
> > > > > +
> > > > > +NB: RFC 2822 stipulates that message bodies should use CRLF
> > > > > +line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
> > > > > +will thus convert any plain CRs in a quoted-printable message
> > > > > +body into CRLF. Python, though, doesn't do this, so the output of
> > > > > +our implementation of C<digest_payloads()> diverges from that of the 
> > > > > Python
> > > > > +original. It doesn't ultimately make a difference since the 
> > > > > line-ending
> > > > > +whitespace gets trimmed regardless, but it's necessary to factor 
> > > > > in when
> > > > > +comparing the output of our implementation with the Python output.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub digest_payloads {
> > > > > +    my ($parsed) = @_;
> > > > > +
> > > > > +    my @subparts = $parsed->subparts();
> > > > > +
> > > > > +    my @payloads;
> > > > > +
> > > > > +    if (@subparts) {
> > > > > +        @payloads = map { @{ digest_payloads($_) } } 
> > > > > $parsed->subparts();
> > > > > +    }
> > > > > +    else {
> > > > > +        my ( $main_type, $subtype, $encoding, $encode_check ) = 
> > > > > parse_content_type( $parsed->content_type() );
> > > > > +
> > > > > +        my $payload;
> > > > > +
> > > > > +        if ( $main_type eq 'text' ) {
> > > > > +
> > > > > +            # Decode transfer encoding, but leave us as a byte 
> > > > > string.
> > > > > +            # Note that this is where Email::MIME converts plain LF 
> > > > > to CRLF.
> > > > > +            $payload = $parsed->body();
> > > > > +
> > > > > +            # This does the actual character decoding (i.e., 
> > > > > "charset").
> > > > > +            $payload = Encode::decode( $encoding, $payload, 
> > > > > $encode_check );
> > > > > +
> > > > > +            if ( $subtype eq 'html' ) {
> > > > > +                require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > > > > +                $payload = 
> > > > > Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
> > > > > +            }
> > > > > +        }
> > > > > +        else {
> > > > > +
> > > > > +            # This does no decoding, even of, e.g., quoted-printable 
> > > > > or base64.
> > > > > +            $payload = $parsed->body_raw();
> > > > > +        }
> > > > > +
> > > > > +        push @payloads, $payload;
> > > > > +    }
> > > > > +
> > > > > +    return \@payloads;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 normalize( $STRING )
> > > > > +
> > > > > +This imitates the corresponding object method in F<digest.py>.
> > > > > +It modifies C<$STRING> in-place.
> > > > > +
> > > > > +As with the original implementation, if C<$STRING> contains (decoded)
> > > > > +Unicode characters, those characters will be parsed accordingly. So:
> > > > > +
> > > > > +    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
> > > > > +
> > > > > +    normalize($str);
> > > > > +
> > > > > +The above will leave C<$str> alone, but this:
> > > > > +
> > > > > +    utf8::decode($str);
> > > > > +
> > > > > +    normalize($str);
> > > > > +
> > > > > +... will trim off the last two bytes from C<$str>.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking 
> > > > > )
> > > > > +
> > > > > +    # NULs are bad, mm-kay?
> > > > > +    $_[0] =~ tr<\0><>d;
> > > > > +
> > > > > +    # NB: Python's \s without re.UNICODE is the same as Perl's \s
> > > > > +    # with the /a modifier.
> > > > > +    #
> > > > > +    # https://docs.python.org/2/library/re.html
> > > > > +    # 
> > > > > https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
> > > > > +
> > > > > +    # Python: re.compile(r'\S{10,}')
> > > > > +    $_[0] =~ s<\S{10,}><>ag;
> > > > > +
> > > > > +    # Python: re.compile(r'\S+@\S+')
> > > > > +    $_[0] =~ s<\S+ @ \S+><>agx;
> > > > > +
> > > > > +    # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
> > > > > +    $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
> > > > > +
> > > > > +    # (from digest.py ...)
> > > > > +    # Make sure we do the whitespace last because some of the 
> > > > > previous
> > > > > +    # patterns rely on whitespace.
> > > > > +    $_[0] =~ tr< \x09-\x0d><>d;
> > > > > +
> > > > > +    # This is fun. digest.py's normalize() does a non-UNICODE 
> > > > > whitespace
> > > > > +    # strip, then calls strip() on the string, which *will* strip 
> > > > > Unicode
> > > > > +    # whitespace from the ends.
> > > > > +    $_[0] =~ s<\A\s+><>;
> > > > > +    $_[0] =~ s<\s+\z><>;
> > > > > +
> > > > > +    return;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 $yn = should_handle_line( $STRING )
> > > > > +
> > > > > +This imitates the corresponding object method in F<digest.py>.
> > > > > +It returns a boolean.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub should_handle_line {
> > > > > +    return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 $sr = assemble_lines( \@LINES )
> > > > > +
> > > > > +This assembles a string buffer out of @LINES. The string is the 
> > > > > buffer
> > > > > +of octets that will be hashed to produce the message digest.
> > > > > +
> > > > > +Each member of @LINES is expected to be an B<octet string>, not a
> > > > > +character string.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub assemble_lines {
> > > > > +    my ($lines_ar) = @_;
> > > > > +
> > > > > +    if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
> > > > > +
> > > > > +        # cf. handle_atomic() in digest.py
> > > > > +        return \join( q<>, @$lines_ar );
> > > > > +    }
> > > > > +
> > > > > +    
> > > > > #----------------------------------------------------------------------
> > > > > +    # cf. handle_pieces() in digest.py
> > > > > +
> > > > > +    my $str = q<>;
> > > > > +
> > > > > +    for my $ofs_len ( _HASH_SPEC() ) {
> > > > > +        my ( $offset, $length ) = @$ofs_len;
> > > > > +
> > > > > +        for my $i ( 0 .. ( $length - 1 ) ) {
> > > > > +            my $idx = int( $offset * @$lines_ar / 100 ) + $i;
> > > > > +
> > > > > +            next if !defined $lines_ar->[$idx];
> > > > > +
> > > > > +            $str .= $lines_ar->[$idx];
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    return \$str;
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( 
> > > > > $CONTENT_TYPE )
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +use constant _QUOTED_PRINTABLE_NAMES => (
> > > > > +    "quopri-codec",
> > > > > +    "quopri",
> > > > > +    "quoted-printable",
> > > > > +    "quotedprintable",
> > > > > +);
> > > > > +
> > > > > +# Make Encode::decode() ignore anything that doesn’t fit the
> > > > > +# given encoding.
> > > > > +use constant _encode_check_ignore => q<>;
> > > > > +
> > > > > +sub parse_content_type {
> > > > > +    my ($content_type) = @_;
> > > > > +
> > > > > +    $Email::MIME::ContentType::STRICT_PARAMS = 0;
> > > > > +    my $ct_parse = Email::MIME::ContentType::parse_content_type(
> > > > > +        $content_type,
> > > > > +    );
> > > > > +
> > > > > +    my $main = $ct_parse->{'type'}    || q<>;
> > > > > +    my $sub  = $ct_parse->{'subtype'} || q<>;
> > > > > +
> > > > > +    my $encoding = $ct_parse->{'attributes'}{'charset'};
> > > > > +
> > > > > +    my $checkval;
> > > > > +
> > > > > +    if ($encoding) {
> > > > > +
> > > > > +        # Lower-case everything, convert underscore to dash, and 
> > > > > remove NUL.
> > > > > +        $encoding =~ tr<A-Z_\0><a-z->d;
> > > > > +
> > > > > +        # Apparently pyzor accommodates messages that put the 
> > > > > transfer
> > > > > +        # encoding in the Content-Type.
> > > > > +        if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
> > > > > +            $checkval = Encode::FB_CROAK();
> > > > > +        }
> > > > > +    }
> > > > > +    else {
> > > > > +        $encoding = 'ascii';
> > > > > +    }
> > > > > +
> > > > > +    # Match Python .decode()’s 'ignore' behavior
> > > > > +    $checkval ||= \&_encode_check_ignore;
> > > > > +
> > > > > +    return ( $main, $sub, $encoding, $checkval );
> > > > > +}
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head2 @lines = splitlines( $TEXT )
> > > > > +
> > > > > +Imitates C<str.splitlines()>. (cf. C<pydoc str>)
> > > > > +
> > > > > +Returns a plain list in list context. Returns the number of
> > > > > +items to be returned in scalar context.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub splitlines {
> > > > > +    return split m<\r\n?|\n>, $_[0];
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm 
> > > > > b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > > > new file mode 100644
> > > > > index 0000000..2617b4a
> > > > > --- /dev/null
> > > > > +++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
> > > > > @@ -0,0 +1,177 @@
> > > > > +package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
> > > > > +
> > > > > +# Copyright 2018 cPanel, LLC.
> > > > > +# All rights reserved.
> > > > > +# http://cpanel.net
> > > > > +#
> > > > > +# <@LICENSE>
> > > > > +# Licensed to the Apache Software Foundation (ASF) under one or more
> > > > > +# contributor license agreements.  See the NOTICE file distributed 
> > > > > with
> > > > > +# this work for additional information regarding copyright ownership.
> > > > > +# The ASF licenses this file to you under the Apache License, 
> > > > > Version 2.0
> > > > > +# (the "License"); you may not use this file except in compliance 
> > > > > with
> > > > > +# the License.  You may obtain a copy of the License at:
> > > > > +#
> > > > > +#     http://www.apache.org/licenses/LICENSE-2.0
> > > > > +#
> > > > > +# Unless required by applicable law or agreed to in writing, software
> > > > > +# distributed under the License is distributed on an "AS IS" BASIS,
> > > > > +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
> > > > > implied.
> > > > > +# See the License for the specific language governing permissions and
> > > > > +# limitations under the License.
> > > > > +# </@LICENSE>
> > > > > +#
> > > > > +
> > > > > +use strict;
> > > > > +use warnings;
> > > > > +
> > > > > +=encoding utf-8
> > > > > +
> > > > > +=head1 NAME
> > > > > +
> > > > > +Mail::SpamAssassin::Pyzor::Digest::StripHtml
> > > > > +
> > > > > +=head1 SYNOPSIS
> > > > > +
> > > > > +    my $stripped = 
> > > > > Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
> > > > > +
> > > > > +=head1 DESCRIPTION
> > > > > +
> > > > > +This module attempts to duplicate pyzor’s HTML-stripping logic.
> > > > > +
> > > > > +=head1 ACCURACY
> > > > > +
> > > > > +This library cannot achieve 100%, bug-for-bug parity with pyzor
> > > > > +because to do so would require duplicating Python’s own HTML 
> > > > > parsing
> > > > > +library. Since that library’s output has changed over time, and 
> > > > > those
> > > > > +changes in turn affect pyzor, it’s literally impossible to arrive 
> > > > > at
> > > > > +a single, fully-compatible reimplementation.
> > > > > +
> > > > > +That said, all known divergences between pyzor and this library 
> > > > > involve
> > > > > +invalid HTML as input.
> > > > > +
> > > > > +Please open bug reports for any divergences you identify, 
> > > > > particularly
> > > > > +if the input is valid HTML.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +use HTML::Parser ();
> > > > > +
> > > > > +our $VERSION = '0.03';
> > > > > +
> > > > > +#----------------------------------------------------------------------
> > > > > +
> > > > > +=head1 FUNCTIONS
> > > > > +
> > > > > +=head2 $stripped = strip( $HTML )
> > > > > +
> > > > > +Give it some HTML, and it’ll give back the stripped text.
> > > > > +
> > > > > +In B<general>, the stripping consists of removing tags as well as
> > > > > +C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
> > > > > +removes HTML entities.
> > > > > +
> > > > > +This tries very hard to duplicate pyzor’s behavior with invalid 
> > > > > HTML.
> > > > > +
> > > > > +=cut
> > > > > +
> > > > > +sub strip {
> > > > > +    my ($html) = @_;
> > > > > +
> > > > > +    $html =~ s<\A\s+><>;
> > > > > +    $html =~ s<\s+\z><>;
> > > > > +
> > > > > +    my $p = HTML::Parser->new( api_version => 3 );
> > > > > +
> > > > > +    my @pieces;
> > > > > +
> > > > > +    my $accumulate = 1;
> > > > > +
> > > > > +    $p->handler(
> > > > > +        start => sub {
> > > > > +            my ($tagname) = @_;
> > > > > +
> > > > > +            $accumulate = 0 if $tagname eq 'script';
> > > > > +            $accumulate = 0 if $tagname eq 'style';
> > > > > +
> > > > > +            return;
> > > > > +        },
> > > > > +        'tagname',
> > > > > +    );
> > > > > +
> > > > > +    $p->handler(
> > > > > +        end => sub {
> > > > > +            $accumulate = 1;
> > > > > +            return;
> > > > > +        }
> > > > > +    );
> > > > > +
> > > > > +    $p->handler(
> > > > > +        text => sub {
> > > > > +            my ($copy) = @_;
> > > > > +
> > > > > +            return if !$accumulate;
> > > > > +
> > > > > +            # pyzor’s HTML parser discards HTML entities. On top 
> > > > > of that,
> > > > > +            # we need to match, as closely as possible, pyzor’s 
> > > > > handling of
> > > > > +            # invalid HTML entities — which is a function of 
> > > > > Python’s
> > > > > +            # standard HTML parsing library. This will probably 
> > > > > never be
> > > > > +            # fully compatible with the pyzor, but we can get it 
> > > > > close.
> > > > > +
> > > > > +            # The original is:
> > > > > +            #
> > > > > +            #   
> > > > > re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
> > > > > +            #
> > > > > +            # The parsing loop then “backs up” one byte if the 
> > > > > last
> > > > > +            # character isn’t a “;”. We use a look-ahead 
> > > > > assertion to
> > > > > +            # mimic that behavior.
> > > > > +            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z 
> > > > > | (?=[^0-9a-fA-F]) )>< >gx;
> > > > > +
> > > > > +            # The original is:
> > > > > +            #
> > > > > +            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
> > > > > +            #
> > > > > +            # We again use a look-ahead assertion to mimic Python.
> > > > > +            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | 
> > > > > (?=[^a-zA-Z0-9]) )>< >gx;
> > > > > +
> > > > > +            # Python’s HTMLParser aborts its parsing loop when it 
> > > > > encounters
> > > > > +            # an invalid numeric reference.
> > > > > +            $copy =~ s<\&\#
> > > > > +                (?:
> > > > > +                    [^0-9xX]        # anything but the expected 
> > > > > first char
> > > > > +                    |
> > > > > +                    [0-9]+[a-fA-F]  # hex within decimal
> > > > > +                    |
> > > > > +                    [xX][^0-9a-fA-F]
> > > > > +                )
> > > > > +                (.*)
> > > > > +            ><
> > > > > +                ( -1 == index($1, ';') ) ? q<> : '&#'
> > > > > +            >exs;
> > > > > +
> > > > > +            # Python’s HTMLParser treats invalid entities as 
> > > > > incomplete
> > > > > +            $copy =~ s<(\&\#?)><$1 >gx;
> > > > > +
> > > > > +            $copy =~ s<\A\s+><>;
> > > > > +            $copy =~ s<\s+\z><>;
> > > > > +
> > > > > +            push @pieces, \$copy if length $copy;
> > > > > +        },
> > > > > +        'text,tagname',
> > > > > +    );
> > > > > +
> > > > > +    $p->parse($html);
> > > > > +    $p->eof();
> > > > > +
> > > > > +    my $payload = join( q< >, map { $$_ } @pieces );
> > > > > +
> > > > > +    # Convert all sequences of whitespace OTHER THAN non-breaking 
> > > > > spaces to
> > > > > +    # plain spaces.
> > > > > +    $payload =~ s<[^\S\x{a0}]+>< >g;
> > > > > +
> > > > > +    return $payload;
> > > > > +}
> > > > > +
> > > > > +1;
> > > > > diff --git a/t/pyzor.t b/t/pyzor.t
> > > > > index 891f38d..e4ef83f 100755
> > > > > --- a/t/pyzor.t
> > > > > +++ b/t/pyzor.t
> > > > > @@ -3,12 +3,9 @@
> > > > >   use lib '.'; use lib 't';
> > > > >   use SATest; sa_t_init("pyzor");
> > > > > -use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); 
> > > > > chomp; -x };
> > > > > -
> > > > >   use Test::More;
> > > > >   plan skip_all => "Net tests disabled" unless 
> > > > > conf_bool('run_net_tests');
> > > > > -plan skip_all => "Pyzor executable not found in path" unless 
> > > > > HAS_PYZOR;
> > > > > -plan tests => 8;
> > > > > +plan tests => 5;
> > > > >   diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests 
> > > > > can fail due to problems with the Pyzor servers.');
> > > > > @@ -30,7 +27,7 @@ tstprefs ("
> > > > >   sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> > > > >   ok_all_patterns();
> > > > >   # Same with fork
> > > > > -sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", 
> > > > > \&patterns_run_cb);
> > > > > +sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
> > > > >   ok_all_patterns();
> > > > >   #TESTING FOR HAM
> > > > > @@ -44,7 +41,3 @@ ok_all_patterns();
> > > > >   sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
> > > > >   ok_all_patterns();
> > > > > -# same with fork
> > > > > -sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", 
> > > > > \&patterns_run_cb);
> > > > > -ok_all_patterns();
> > > > > -
> > > > 
> > 
> -- 
> Kevin A. McGrail
> kmcgr...@apache.org
> 
> Member, Apache Software Foundation
> Chair Emeritus Apache SpamAssassin Project
> https://www.linkedin.com/in/kmcgrail - 703.798.0171

Reply via email to