Hi,
cPanel has developed a native Perl Pyzor implementation for SpamAssassin
and a diff against SpamAssassin 4.0 follows.
At the moment I am using it in production on a small server; more tests and
opinions are welcome.

Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.

 Cheers
  Giovanni

diff --git a/MANIFEST b/MANIFEST
index 25d0192..2d9588c 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
 lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
 lib/Mail/SpamAssassin/PluginHandler.pm
 lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
+lib/Mail/SpamAssassin/Pyzor/Client.pm
+lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
+lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
+lib/Mail/SpamAssassin/Pyzor/Digest.pm
+lib/Mail/SpamAssassin/Pyzor.pm
 lib/Mail/SpamAssassin/RegistryBoundaries.pm
 lib/Mail/SpamAssassin/Reporter.pm
 lib/Mail/SpamAssassin/SQLBasedAddrList.pm
diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
index 3efd4b4..e4c9c05 100644
--- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
+++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
@@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Logger;
-use Mail::SpamAssassin::Timeout;
-use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
-                                proc_status_ok exit_status_str);
+use Mail::SpamAssassin::Util qw(untaint_var);
+
 use strict;
 use warnings;
 # use bytes;
 use re 'taint';
 
-use Storable;
-use POSIX qw(PIPE_BUF WNOHANG _exit);
-
 our @ISA = qw(Mail::SpamAssassin::Plugin);
 
 sub new {
@@ -78,7 +74,7 @@ sub set_config {
   my ($self, $conf) = @_;
   my @cmds;
 
-=head1 USER OPTIONS
+=head1 ADMINISTRATOR OPTIONS
 
 =over 4
 
@@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
   });
 
-=item pyzor_fork (0|1)         (default: 0)
-
-Instead of running Pyzor synchronously, fork separate process for it and
-read the results in later (similar to async DNS lookups).  Increases
-throughput.  Experimental.
-
-=cut
-
-  push(@cmds, {
-    setting => 'pyzor_fork',
-    is_admin => 1,
-    default => 0,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
-  });
-
-=item pyzor_count_min NUMBER   (default: 5)
+=item pyzor_count_min NUMBER           (default: 5)
 
 This option sets how often a message's body checksum must have been
 reported to the Pyzor server before SpamAssassin will consider the Pyzor
@@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
   });
 
-  # Deprecated setting, the name makes no sense!
-  push (@cmds, {
-    setting => 'pyzor_max',
-    is_admin => 1,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
-      if ($value !~ /^\d+$/) {
-        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
-      }
-      $self->{pyzor_count_min} = $value;
-    }
-  });
-
-=item pyzor_whitelist_min NUMBER       (default: 10)
-
-This option sets how often a message's body checksum must have been
-whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
-result.  Final decision is made by pyzor_whitelist_factor.
-
-=cut
-
-  push (@cmds, {
-    setting => 'pyzor_whitelist_min',
-    is_admin => 1,
-    default => 10,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
-  });
-
-=item pyzor_whitelist_factor NUMBER    (default: 0.2)
-
-Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
-For default setting this means: 50 reports requires 10 whitelistings.
-
-=cut
-
-  push (@cmds, {
-    setting => 'pyzor_whitelist_factor',
-    is_admin => 1,
-    default => 0.2,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
-  });
-
 =back
 
-=head1 ADMINISTRATOR OPTIONS
-
 =over 4
 
 =item pyzor_timeout n          (default: 5)
@@ -210,478 +145,182 @@ removing one of them.
     type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
   });
 
-=item pyzor_options options
+=item pyzor_whitelist_min NUMBER        (default: 10)
 
-Specify additional options to the pyzor(1) command. Please note that only
-characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
+This option sets how often a message's body checksum must have been
+whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
+result.  Final decision is made by pyzor_whitelist_factor.
 
 =cut
 
   push (@cmds, {
-    setting => 'pyzor_options',
+    setting => 'pyzor_whitelist_min',
     is_admin => 1,
-    default => '',
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
-       return $Mail::SpamAssassin::Conf::INVALID_VALUE;
-      }
-      $self->{pyzor_options} = $1;
-    }
+    default => 10,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
   });
 
-=item pyzor_path STRING
+=item pyzor_whitelist_factor NUMBER     (default: 0.2)
 
-This option tells SpamAssassin specifically where to find the C<pyzor>
-client instead of relying on SpamAssassin to find it in the current
-PATH.  Note that if I<taint mode> is enabled in the Perl interpreter,
-you should use this, as the current PATH will have been cleared.
+Ignore Pyzor result if WLCOUNT >= pyzor_whitelist_min and WLCOUNT >=
+REPORTCOUNT x NUMBER.  With default settings: 50 reports require 10 whitelistings.
 
 =cut
 
   push (@cmds, {
-    setting => 'pyzor_path',
+    setting => 'pyzor_whitelist_factor',
     is_admin => 1,
-    default => undef,
-    type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
-    code => sub {
-      my ($self, $key, $value, $line) = @_;
-      if (!defined $value || !length $value) {
-       return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
-      }
-      $value = untaint_file_path($value);
-      if (!-x $value) {
-       info("config: pyzor_path \"$value\" isn't an executable");
-       return $Mail::SpamAssassin::Conf::INVALID_VALUE;
-      }
-
-      $self->{pyzor_path} = $value;
-    }
+    default => 0.2,
+    type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
   });
 
   $conf->{parser}->register_commands(\@cmds);
 }
 
 sub is_pyzor_available {
-  my ($self) = @_;
+    my ($self) = @_;
 
-  my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
-    Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
-
-  unless ($pyzor && -x $pyzor) {
-    dbg("pyzor: no pyzor executable found");
-    $self->{pyzor_available} = 0;
-    return 0;
-  }
-
-  # remember any found pyzor
-  $self->{main}->{conf}->{pyzor_path} = $pyzor;
-
-  dbg("pyzor: pyzor is available: $pyzor");
-  return 1;
+    local $@;
+    eval {
+        require Mail::SpamAssassin::Pyzor::Digest;
+        require Mail::SpamAssassin::Pyzor::Client;
+    };
+    return $@ ? 0 : 1;
 }
 
-sub finish_parsing_start {
-  my ($self, $opts) = @_;
+sub get_pyzor_interface {
+  my ($self) = @_;
 
-  # If forking, hard adjust priority -100 to launch early
-  # Find rulenames from eval_to_rule mappings
-  if ($opts->{conf}->{pyzor_fork}) {
-    foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
-      dbg("pyzor: adjusting rule $_ priority to -100");
-      $opts->{conf}->{priority}->{$_} = -100;
-    }
+  if (!$self->{main}->{conf}->{use_pyzor}) {
+    dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
+    $self->{pyzor_interface} = "disabled";
+    $self->{pyzor_available} = 0;
+  }
+  elsif ($self->is_pyzor_available()) {
+    $self->{pyzor_interface} = "pyzor";
+    $self->{pyzor_available} = 1;
+  }
+  else {
+    dbg("pyzor: no pyzor found, disabling Pyzor");
+    $self->{pyzor_available} = 0;
   }
 }
 
 sub check_pyzor {
-  my ($self, $pms, $full) = @_;
-
-  return 0 if !$self->{pyzor_available};
-  return 0 if !$self->{main}->{conf}->{use_pyzor};
-
-  return 0 if $pms->{pyzor_running};
-  $pms->{pyzor_running} = 1;
-
-  return 0 if !$self->is_pyzor_available();
-
-  my $timer = $self->{main}->time_method("check_pyzor");
+  my ($self, $permsgstatus, $full) = @_;
 
   # initialize valid tags
-  $pms->{tag_data}->{PYZOR} = '';
-
-  # create fulltext tmpfile now (before possible forking)
-  $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
-
-  ## non-forking method
-
-  if (!$self->{main}->{conf}->{pyzor_fork}) {
-    my @results = $self->pyzor_lookup($pms);
-    return $self->_check_result($pms, \@results);
-  }
-
-  ## forking method
-
-  $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
-  $pms->rule_pending($pms->{pyzor_rulename}); # mark async
-
-  # create socketpair for communication
-  $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
-  my $back_selector = '';
-  $pms->{pyzor_backchannel}->set_selector(\$back_selector);
-  eval {
-    $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
-  } or do {
-    dbg("pyzor: backchannel pre-setup failed: $@");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  };
+  $permsgstatus->{tag_data}->{PYZOR} = "";
 
-  my $pid = fork();
-  if (!defined $pid) {
-    info("pyzor: child fork failed: $!");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  }
-  if (!$pid) {
-    $0 = "$0 (pyzor)";
-    $SIG{CHLD} = 'DEFAULT';
-    $SIG{PIPE} = 'IGNORE';
-    $SIG{$_} = sub {
-      eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
-      _exit(6);  # avoid END and destructor processing
-      kill('KILL',$$);  # still kicking? die!
-      } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
-    dbg("pyzor: child process $$ forked");
-    $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
-    my @results = $self->pyzor_lookup($pms);
-    my $backmsg;
-    eval {
-      $backmsg = Storable::freeze(\@results);
-    };
-    if ($@) {
-      dbg("pyzor: child return value freeze failed: $@");
-      _exit(0); # avoid END and destructor processing
-    }
-    if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
-      dbg("pyzor: child backchannel write failed: $!");
-    }
-    _exit(0); # avoid END and destructor processing
-  }
-
-  $pms->{pyzor_pid} = $pid;
+  my $timer = $self->{main}->time_method("check_pyzor");
 
-  eval {
-    $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
-  } or do {
-    dbg("pyzor: backchannel post-setup failed: $@");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  };
+  $self->get_pyzor_interface();
+  return 0 unless $self->{pyzor_available};
 
-  return 0;
+  return $self->pyzor_lookup($permsgstatus, $full);
 }
 
 sub pyzor_lookup {
-  my ($self, $pms) = @_;
-
-  my $conf = $self->{main}->{conf};
-  my $timeout = $conf->{pyzor_timeout};
-
-  # note: not really tainted, this came from system configuration file
-  my $path = untaint_file_path($conf->{pyzor_path});
-  my $opts = untaint_var($conf->{pyzor_options}) || '';
-
-  $pms->enter_helper_run_mode();
-
-  my $pid;
-  my @resp;
-  my $timer = Mail::SpamAssassin::Timeout->new(
-           { secs => $timeout, deadline => $pms->{master_deadline} });
-  my $err = $timer->run_and_catch(sub {
-    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
-
-    dbg("pyzor: opening pipe: ".
-      join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
-
-    $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
-       $pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
-    $pid or die "$!\n";
-
-    # read+split avoids a Perl I/O bug (Bug 5985)
-    my($inbuf, $nread);
-    my $resp = '';
-    while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
-    defined $nread  or die "error reading from pipe: $!";
-    @resp = split(/^/m, $resp, -1);
-
-    my $errno = 0;
-    close PYZOR or $errno = $!;
-    if (proc_status_ok($?, $errno)) {
-      dbg("pyzor: [%s] finished successfully", $pid);
-    } elsif (proc_status_ok($?, $errno, 0, 1)) {  # sometimes it exits with 1
-      dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
-    } else {
-      info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
-    }
-
-  });
-
-  if (defined(fileno(*PYZOR))) {  # still open
-    if ($pid) {
-      if (kill('TERM', $pid)) {
-        dbg("pyzor: killed stale helper [$pid]");
-      } else {
-        dbg("pyzor: killing helper application [$pid] failed: $!");
-      }
-    }
-    my $errno = 0;
-    close PYZOR or $errno = $!;
-    proc_status_ok($?, $errno)
-      or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
-  }
-
-  $pms->leave_helper_run_mode();
-
-  if ($timer->timed_out()) {
-    dbg("pyzor: check timed out after $timeout seconds");
-    return ();
-  } elsif ($err) {
-    chomp $err;
-    info("pyzor: check failed: $err");
-    return ();
-  }
-
-  return @resp;
-}
-
-sub check_tick {
-  my ($self, $opts) = @_;
-  $self->_check_forked_result($opts->{permsgstatus}, 0);
-}
-
-sub check_cleanup {
-  my ($self, $opts) = @_;
-  $self->_check_forked_result($opts->{permsgstatus}, 1);
-}
-
-sub _check_forked_result {
-  my ($self, $pms, $finish) = @_;
-
-  return 0 if !$pms->{pyzor_backchannel};
-  return 0 if !$pms->{pyzor_pid};
+    my ( $self, $permsgstatus, $fulltext ) = @_;
+    my $conf = $self->{main}->{conf};
+    my $timeout = $conf->{pyzor_timeout};
+
+    my $client = ( $self->{'_pyzor_client'} ||= 
Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
+
+    local $@;
+    my $ref = eval { $client->check($digest); };
+    dbg("pyzor: got response: $client->{'_server_host'}");
+    # $client reply must be a hash reference
+    return 0 if (not (ref $ref eq ref {}));
+    if ($@) {
+        my $err = $@;
 
-  my $timer = $self->{main}->time_method("check_pyzor");
+        $err = eval { $err->get_message() } || $err;
 
-  $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
-
-  my $kid_pid = $pms->{pyzor_pid};
-  # if $finish, force waiting for the child
-  my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
-  if ($pid == 0) {
-    #dbg("pyzor: child process $kid_pid not finished yet, trying later");
-    if ($pms->{pyzor_abort}) {
-      dbg("pyzor: bailing out due to deadline/shortcircuit");
-      kill('TERM', $kid_pid);
-      if (waitpid($kid_pid, WNOHANG) == 0) {
-        sleep(1);
-        if (waitpid($kid_pid, WNOHANG) == 0) {
-          dbg("pyzor: child process $kid_pid still alive, KILL");
-          kill('KILL', $kid_pid);
-          waitpid($kid_pid, 0);
+        warn("pyzor: check failed: $err\n");
+        return 0;
+    } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
+        if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
+          dbg("pyzor: check failed with invalid code: $ref->{'Code'}: 
$ref->{'Diag'}");
+        } else {
+          dbg("pyzor: check failed with undefined code");
         }
-      }
-      delete $pms->{pyzor_pid};
-      delete $pms->{pyzor_backchannel};
+        return 0;
     }
-    return 0;
-  } elsif ($pid == -1) {
-    # child does not exist?
-    dbg("pyzor: child process $kid_pid already handled?");
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  }
 
-  $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
+    my $pyzor_count       = untaint_var($ref->{'Count'}) + 0;
+    my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
+    my $count_min = $conf->{pyzor_count_min};
+    my $wl_min = $conf->{pyzor_whitelist_min};
 
-  dbg("pyzor: child process $kid_pid finished, reading results");
+    my $wl_limit = $pyzor_whitelisted >= $wl_min ?
+      $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
 
-  my $backmsg;
-  my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg, 
PIPE_BUF);
-  if (!defined $ret || $ret == 0) {
-    dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
-    delete $pms->{pyzor_backchannel};
-    return 0;
-  }
-
-  delete $pms->{pyzor_backchannel};
+    $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted 
$pyzor_whitelisted times.");
 
-  my $results;
-  eval {
-    $results = Storable::thaw($backmsg);
-  };
-  if ($@) {
-    dbg("pyzor: child return value thaw failed: $@");
-    return;
-  }
-
-  $self->_check_result($pms, $results);
-}
+    dbg("pyzor: result: COUNT=$pyzor_count/$count_min 
WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
+      $wl_limit);
 
-sub _check_result {
-  my ($self, $pms, $results) = @_;
-
-  if (!@$results) {
-    dbg("pyzor: no response from server");
-    return 0;
-  }
-
-  my $count = 0;
-  my $count_wl = 0;
-  foreach my $res (@$results) {
-    chomp($res);
-    if ($res =~ /^Traceback/) {
-      info("pyzor: internal error, python traceback seen in response: $res");
+    # Empty body etc results in same hash, we should skip very large numbers..
+    if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
+      dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 
1000000/10000");
       return 0;
     }
-    dbg("pyzor: got response: $res");
-    # this regexp is intended to be a little bit forgiving
-    if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
-      # until pyzor servers can sync their DBs,
-      # sum counts obtained from all servers
-      $count += untaint_var($1)+0; # crazy but needs untainting
-      $count_wl += untaint_var($2)+0;
-    } else {
-      # warn on failures to parse
-      info("pyzor: failure to parse response \"$res\"");
-    }
-  }
-
-  my $conf = $self->{main}->{conf};
-
-  my $count_min = $conf->{pyzor_count_min};
-  my $wl_min = $conf->{pyzor_whitelist_min};
 
-  my $wl_limit = $count_wl >= $wl_min ?
-    $count * $conf->{pyzor_whitelist_factor} : 0;
-
-  dbg("pyzor: result: COUNT=$count/$count_min 
WHITELIST=$count_wl/$wl_min/%.1f",
-    $wl_limit);
-  $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl 
times.");
-
-  # Empty body etc results in same hash, we should skip very large numbers..
-  if ($count >= 1000000 || $count_wl >= 10000) {
-    dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl 
1000000/10000");
-    return 0;
-  }
-
-  # Whitelisted?
-  if ($wl_limit && $count_wl >= $wl_limit) {
-    dbg("pyzor: message whitelisted");
-    return 0;
-  }
+    # Whitelisted?
+    if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
+      dbg("pyzor: message whitelisted");
+      return 0;
+    }
 
-  if ($count >= $count_min) {
-    if ($conf->{pyzor_fork}) {
-      # forked needs to run got_hit()
-      $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
+    if ( $pyzor_count >= $count_min ) {
+      return 1;
     }
-    return 1;
-  }
 
-  return 0;
+    return 0;
 }
 
 sub plugin_report {
   my ($self, $options) = @_;
 
-  return if !$self->{pyzor_available};
-  return if !$self->{main}->{conf}->{use_pyzor};
-  return if $options->{report}->{options}->{dont_report_to_pyzor};
-  return if !$self->is_pyzor_available();
-
-  # use temporary file: open2() is unreliable due to buffering under spamd
-  my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
-  if ($self->pyzor_report($options, $tmpf)) {
-    $options->{report}->{report_available} = 1;
-    info("reporter: spam reported to Pyzor");
-    $options->{report}->{report_return} = 1;
-  }
-  else {
-    info("reporter: could not report spam to Pyzor");
-  }
-  $options->{report}->delete_fulltext_tmpfile($tmpf);
+  return unless $self->{pyzor_available};
+  return unless $self->{main}->{conf}->{use_pyzor};
 
-  return 1;
+  if (!$options->{report}->{options}->{dont_report_to_pyzor} && 
$self->is_pyzor_available())
+  {
+    if ($self->pyzor_report($options)) {
+      $options->{report}->{report_available} = 1;
+      info("reporter: spam reported to Pyzor");
+      $options->{report}->{report_return} = 1;
+    }
+    else {
+      info("reporter: could not report spam to Pyzor");
+    }
+  }
 }
 
 sub pyzor_report {
-  my ($self, $options, $tmpf) = @_;
-
-  # note: not really tainted, this came from system configuration file
-  my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
-  my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
+    my ( $self, $options ) = @_;
 
-  my $timeout = $self->{main}->{conf}->{pyzor_timeout};
+    my $timeout = $self->{main}->{conf}->{pyzor_timeout};
 
-  $options->{report}->enter_helper_run_mode();
+    my $client = ( $self->{'_pyzor_client'} ||= 
Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
 
-  my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
-  my $err = $timer->run_and_catch(sub {
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
 
-    local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
-
-    dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "< 
$tmpf"));
-
-    my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
-       $tmpf, 1, $path, split(' ', $opts), "report");
-    $pid or die "$!\n";
-
-    my($inbuf,$nread,$nread_all); $nread_all = 0;
-    # response is ignored, just check its existence
-    while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
-    defined $nread  or die "error reading from pipe: $!";
-
-    dbg("pyzor: empty response")  if $nread_all < 1;
-
-    my $errno = 0;  close PYZOR or $errno = $!;
-    # closing a pipe also waits for the process executing on the pipe to
-    # complete, no need to explicitly call waitpid
-    # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
-    if (proc_status_ok($?,$errno, 0)) {
-      dbg("pyzor: [%s] reporter finished successfully", $pid);
-    } else {
-      info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
+    local $@;
+    my $ref = eval { $client->report($digest); };
+    if ($@) {
+        warn("pyzor: report failed: $@");
+        return 0;
     }
-
-  });
-
-  $options->{report}->leave_helper_run_mode();
-
-  if ($timer->timed_out()) {
-    dbg("reporter: pyzor report timed out after $timeout seconds");
-    return 0;
-  }
-
-  if ($err) {
-    chomp $err;
-    if ($err eq '__brokenpipe__ignore__') {
-      dbg("reporter: pyzor report failed: broken pipe");
-    } else {
-      warn("reporter: pyzor report failed: $err\n");
+    elsif ( $ref->{'Code'} ne 200 ) {
+        dbg("pyzor: report failed with invalid code: $ref->{'Code'}: 
$ref->{'Diag'}");
+        return 0;
     }
-    return 0;
-  }
 
-  return 1;
+    return 1;
 }
 
-# Version features
-sub has_fork { 1 }
-
 1;
-
-=back
-
-=cut
diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
new file mode 100644
index 0000000..8ac27f4
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor.pm
@@ -0,0 +1,56 @@
+package Mail::SpamAssassin::Pyzor;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+our $VERSION = '0.06_01';
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
+
+=head1 DESCRIPTION
+
+This distribution contains Perl implementations of parts of
+L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
+It is intended for use with L<Mail::SpamAssassin> but may be useful
+in other contexts.
+
+See the following modules for information on specific tools that
+the distribution includes:
+
+=over
+
+=item * L<Mail::SpamAssassin::Pyzor::Client>
+
+=item * L<Mail::SpamAssassin::Pyzor::Digest>
+
+=back
+
+=cut
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm b/lib/Mail/SpamAssassin/Pyzor/Client.pm
new file mode 100644
index 0000000..ccff868
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
@@ -0,0 +1,415 @@
+package Mail::SpamAssassin::Pyzor::Client;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
+
+=head1 SYNOPSIS
+
+    use Mail::SpamAssassin::Pyzor::Client ();
+    use Mail::SpamAssassin::Pyzor::Digest ();
+
+    my $client = Mail::SpamAssassin::Pyzor::Client->new();
+
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
+
+    my $check_ref = $client->check($digest);
+    die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
+
+    my $report_ref = $client->report($digest);
+    die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
+
+=head1 DESCRIPTION
+
+A bare-bones L<Pyzor|http://pyzor.org> client that currently only
+implements the functionality needed for L<Mail::SpamAssassin>.
+
+=head1 PROTOCOL DETAILS
+
+The Pyzor protocol is not a published standard, and there appears to be
+no meaningful public documentation. What follows is enough information,
+largely gleaned through forum posts and reverse engineering, to facilitate
+effective use of this module:
+
+Pyzor is an RPC-oriented, message-based protocol. Each message
+is a simple dictionary of 7-bit ASCII keys and values. Server responses
+always include at least the following:
+
+=over
+
+=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
+is an error.
+
+=item * C<Diag> - Similar to HTTP status reasons: a text description
+of the status.
+
+=back
+
+(NB: There are additional standard response headers that are useful only for
+the protocol itself and thus are not part of this module’s returns.)
+
+=head2 Reliability
+
+Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
+destination. A transmission failure can happen in either the request or
+the response; in either case the request times out. This implementation
+emits a warning on timeout and the request method returns 0, not a hashref.
+
+=cut
+
+#----------------------------------------------------------------------
+
+our $VERSION = '0.04';
+
+our $DEFAULT_SERVER_HOST    = 'public.pyzor.org';
+our $DEFAULT_SERVER_PORT    = 24441;
+our $DEFAULT_USERNAME       = 'anonymous';
+our $DEFAULT_PASSWORD       = '';
+our $DEFAULT_OP_SPEC        = '20,3,60,3';
+our $PYZOR_PROTOCOL_VERSION = 2.1;
+our $DEFAULT_TIMEOUT        = 3.5;
+our $READ_SIZE              = 8192;
+
+use IO::Socket::INET ();
+use Digest::SHA qw(sha1 sha1_hex);
+
+my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User', 
'Time', 'Sig' );
+
+#----------------------------------------------------------------------
+
+=head1 CONSTRUCTOR
+
+=head2 new(%OPTS)
+
+Create a new pyzor client.
+
+=over 2
+
+=item Input
+
+%OPTS are (all optional):
+
+=over 3
+
+=item * C<server_host> - The pyzor server host to connect to (default is
+C<public.pyzor.org>)
+
+=item * C<server_port> - The pyzor server port to connect to (default is
+24441)
+
+=item * C<username> - The username to present to the pyzor server (default
+is C<anonymous>)
+
+=item * C<password> - The password to present to the pyzor server (default
+is empty)
+
+=item * C<timeout> - The maximum time, in seconds, to wait for a response
+from the pyzor server (default is 3.5)
+
+=back
+
+=item Output
+
+=over 3
+
+Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
+
+=back
+
+=back
+
+=cut
+
+sub new {
+    my ( $class, %OPTS ) = @_;
+
+    return bless {
+        '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
+        '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
+        '_username'    => $OPTS{'username'}    || $DEFAULT_USERNAME,
+        '_password'    => $OPTS{'password'}    || $DEFAULT_PASSWORD,
+        '_op_spec'     => $DEFAULT_OP_SPEC,
+        '_timeout'     => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
+    }, $class;
+}
+
+#----------------------------------------------------------------------
+
+=head1 REQUEST METHODS
+
+=head2 report($digest)
+
+Report the digest of a spam message to the pyzor server. This function
+dies if the socket cannot be created; on timeout it warns and returns 0.
+
+=over 2
+
+=item Input
+
+=over 3
+
+=item $digest C<SCALAR>
+
+The message digest to report, as given by
+C<Mail::SpamAssassin::Pyzor::Digest::get()>.
+
+=back
+
+=item Output
+
+=over 3
+
+=item C<HASHREF>
+
+Returns a hashref of the standard attributes noted above.
+
+=back
+
+=back
+
+=cut
+
+sub report {
+    my ( $self, $digest ) = @_;
+
+    my $msg_ref = $self->_get_base_msg( 'report', $digest );
+
+    $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
+
+    return $self->_send_receive_msg($msg_ref);
+}
+
+=head2 check($digest)
+
+Check the digest of a message to see if
+the pyzor server has a report for it. This function
+dies if the socket cannot be created; on timeout it warns and returns 0.
+
+=over 2
+
+=item Input
+
+=over 3
+
+=item $digest C<SCALAR>
+
+The message digest to check, as given by
+C<Mail::SpamAssassin::Pyzor::Digest::get()>.
+
+=back
+
+=item Output
+
+=over 3
+
+=item C<HASHREF>
+
+Returns a hashref of the standard attributes noted above
+as well as the following:
+
+=over
+
+=item * C<Count> - The number of reports the server has received
+for the given digest.
+
+=item * C<WL-Count> - The number of whitelist requests the server has received
+for the given digest.
+
+=back
+
+=back
+
+=back
+
+=cut
+
+sub check {
+    my ( $self, $digest ) = @_;
+
+    return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest ) 
);
+}
+
+# ----------------------------------------
+
+sub _send_receive_msg {
+    my ( $self, $msg_ref ) = @_;
+
+    my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
+
+    $self->_sign_msg($msg_ref);
+
+    return $self->_do_send_receive(
+        $self->_generate_packet_from_message($msg_ref) . "\n\n",
+        $thread_id,
+    );
+}
+
+sub _get_base_msg {
+    my ( $self, $op, $digest ) = @_;
+
+    die "Implementor error: op is required" if !$op;
+    die "error: digest is required"         if !$digest;
+
+    return {
+        'User'      => $self->{'_username'},
+        'PV'        => $PYZOR_PROTOCOL_VERSION,
+        'Time'      => time(),
+        'Op'        => $op,
+        'Op-Digest' => $digest,
+        'Thread'    => $self->_generate_thread_id()
+    };
+}
+
+sub _do_send_receive {
+    my ( $self, $packet, $thread_id ) = @_;
+
+    my $sock = $self->_get_connection_or_die();
+
+    $self->_send_packet( $sock, $packet );
+    my $response = $self->_receive_packet( $sock, $thread_id );
+
+    return 0 if not defined $response;
+
+    my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response ) 
};
+
+    delete $resp_hr->{'Thread'};
+
+    my $response_pv = delete $resp_hr->{'PV'};
+
+    if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
+        warn "Unexpected protocol version ($response_pv) in Pyzor response!";
+    }
+
+    return $resp_hr;
+}
+
+sub _receive_packet {
+    my ( $self, $sock, $thread_id ) = @_;
+
+    my $timeout = $self->{'_timeout'} * 1000;
+
+    my $end_time = time + $self->{'_timeout'};
+
+    $sock->blocking(0);
+    my $response = '';
+    my $rout     = '';
+    my $rin      = '';
+    vec( $rin, fileno($sock), 1 ) = 1;
+
+    while (1) {
+        my $time_left = $end_time - time;
+
+        if ( $time_left <= 0 ) {
+          warn("Did not receive a response from the pyzor server 
$self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} 
seconds!");
+          return;
+        }
+
+        my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
+        if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
+            warn "read from socket: $!";
+        }
+
+        if ( index( $response, "\n\n" ) > -1 ) {
+
+            # Reject the response unless its thread ID matches what we sent.
+            # This prevents confusion among concurrent Pyzor requests.
+            if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
+                last;
+            }
+            else {
+                $response = '';
+            }
+        }
+
+        my $found = select( $rout = $rin, undef, undef, $time_left );
+        warn "select(): $!" if $found == -1;
+    }
+
+    return $response;
+}
+
+sub _send_packet {
+    my ( $self, $sock, $packet ) = @_;
+
+    $sock->blocking(1);
+    syswrite( $sock, $packet ) or warn "write to socket: $!";
+
+    return;
+}
+
+sub _get_connection_or_die {
+    my ($self) = @_;
+
+    # clear the socket if the PID changes
+    if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
+        undef $self->{'_sock_pid'};
+        undef $self->{'_sock'};
+    }
+
+    $self->{'_sock_pid'} ||= $$;
+    $self->{'_sock'}     ||= IO::Socket::INET->new(
+        'PeerHost' => $self->{'_server_host'},
+        'PeerPort' => $self->{'_server_port'},
+        'Proto'    => 'udp'
+    ) or die "Cannot connect to 
$self->{'_server_host'}:$self->{'_server_port'}: $@ $!";
+
+    return $self->{'_sock'};
+}
+
+sub _sign_msg {
+    my ( $self, $msg_ref ) = @_;
+
+    $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
+        Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
+    );
+
+    return 1;
+}
+
+sub _generate_packet_from_message {
+    my ( $self, $msg_ref ) = @_;
+
+    return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length 
$msg_ref->{$_} } @hash_order );
+}
+
+sub _generate_thread_id {
+    my $RAND_MAX = 2**16;
+    my $val      = 0;
+    $val = int rand($RAND_MAX) while $val < 1024;
+    return $val;
+}
+
+sub _get_user_pass_hash_key {
+    my ($self) = @_;
+
+    return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' . 
$self->{'_password'} );
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
new file mode 100644
index 0000000..0e8a5ae
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
@@ -0,0 +1,103 @@
+package Mail::SpamAssassin::Pyzor::Digest;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest
+
+=head1 SYNOPSIS
+
+    my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
+
+=head1 DESCRIPTION
+
+A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
+
+=cut
+
+#----------------------------------------------------------------------
+
+use Email::MIME ();
+
+use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
+use Digest::SHA qw(sha1_hex);
+
+our $VERSION = '0.03';
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $hex = get( $MSG )
+
+This takes an email message in raw MIME text format (i.e., as saved in the
+standard mbox format) and returns the message’s Pyzor digest in lower-case
+hexadecimal.
+
+The output from this function should normally be identical to that of
+the C<pyzor> script’s C<digest> command. It is suitable for use in
+L<Mail::SpamAssassin::Pyzor::Client>’s request methods.
+
+=cut
+
+# Public entry point: hand the first argument unchanged to _get_predigest()
+# and return the lower-case hex SHA-1 of the octet buffer it references.
+sub get {
+    my $predigest_sr = _get_predigest( $_[0] );
+
+    return Digest::SHA::sha1_hex($$predigest_sr);
+}
+
+# NB: This is called from the test.
+#
+# Build the buffer of octets that get() hashes.
+#
+# Input:  the raw MIME message, either as a plain string or as a reference
+#         to one.
+# Output: a scalar reference to the assembled octet string.
+#
+# Every payload of the parsed message is split into lines; each line is
+# normalized, filtered by should_handle_line(), UTF-8 encoded if it is a
+# character string, and finally handed to assemble_lines().
+sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
+    my ($msg_text) = @_;
+
+    # Accept either the raw message text (which is what the POD for get()
+    # describes) or a reference to it. Dereferencing a plain string would
+    # die under "strict refs".
+    my $parsed = Email::MIME->new( ref $msg_text ? $$msg_text : $msg_text );
+
+    my @lines;
+
+    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
+
+    for my $payload (@$payloads_ar) {
+        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
+        for my $line (@p_lines) {
+
+            # normalize() edits $line in place.
+            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);
+
+            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
+
+            # Make sure we have an octet string.
+            utf8::encode($line) if utf8::is_utf8($line);
+
+            push @lines, $line;
+        }
+    }
+
+    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
+
+    return $digest_sr;
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
new file mode 100644
index 0000000..522accd
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
@@ -0,0 +1,301 @@
+package Mail::SpamAssassin::Pyzor::Digest::Pieces;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest::Pieces
+
+=head1 DESCRIPTION
+
+This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
+
+It reimplements logic found in pyzor’s F<digest.py> module
+(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
+
+=cut
+
+#----------------------------------------------------------------------
+
+use Email::MIME::ContentType ();
+use Encode                   ();
+
+our $VERSION = '0.03';
+
+# each tuple is [ offset, length ]
+use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
+
+use constant {
+    _MIN_LINE_LENGTH => 8,
+
+    _ATOMIC_NUM_LINES => 4,
+};
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
+
+This imitates the corresponding object method in F<digest.py>.
+It returns a reference to an array of strings. Each string can be either
+a byte string or a character string (e.g., UTF-8 decoded).
+
+NB: RFC 2822 stipulates that message bodies should use CRLF
+line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
+will thus convert any plain CRs in a quoted-printable message
+body into CRLF. Python, though, doesn’t do this, so the output of
+our implementation of C<digest_payloads()> diverges from that of the Python
+original. It doesn’t ultimately make a difference since the line-ending
+whitespace gets trimmed regardless, but it’s necessary to factor in when
+comparing the output of our implementation with the Python output.
+
+=cut
+
+# Collect the payload strings of $parsed (an Email::MIME-style object that
+# offers subparts(), content_type(), body() and body_raw()).
+#
+# Returns a reference to an array of strings: one per leaf part, in
+# document order. Text parts are transfer-decoded and charset-decoded
+# (HTML is additionally stripped); any other part is passed through raw.
+sub digest_payloads {
+    my ($parsed) = @_;
+
+    my @subparts = $parsed->subparts();
+
+    # A container contributes nothing itself: flatten its children's
+    # payloads, reusing the list fetched above.
+    if (@subparts) {
+        return [ map { @{ digest_payloads($_) } } @subparts ];
+    }
+
+    my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
+
+    my $payload;
+
+    if ( $main_type eq 'text' ) {
+
+        # Decode transfer encoding, but leave us as a byte string.
+        # Note that this is where Email::MIME converts plain LF to CRLF.
+        my $bytes = $parsed->body();
+
+        # This does the actual character decoding (i.e., “charset”).
+        #
+        # Encode::decode() dies on an unknown charset label and, in
+        # strict (FB_CROAK) mode, on malformed input. Trap that here so a
+        # single odd part cannot abort the whole digest.
+        $payload = do {
+            local $@;
+            eval {
+                # decode() may consume its source buffer, so work on a copy
+                # and keep $bytes intact for the fallback below.
+                my $scratch = $bytes;
+                Encode::decode( $encoding, $scratch, $encode_check );
+            };
+        };
+
+        # Same fallback as pyzor’s digest.py: decode as ASCII and silently
+        # drop whatever does not fit.
+        if ( !defined $payload ) {
+            $payload = Encode::decode( 'ascii', $bytes, \&_encode_check_ignore );
+        }
+
+        if ( $subtype eq 'html' ) {
+            require Mail::SpamAssassin::Pyzor::Digest::StripHtml;
+            $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
+        }
+    }
+    else {
+
+        # This does no decoding, even of, e.g., quoted-printable or base64.
+        $payload = $parsed->body_raw();
+    }
+
+    return [$payload];
+}
+
+#----------------------------------------------------------------------
+
+=head2 normalize( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It modifies C<$STRING> in-place.
+
+As with the original implementation, if C<$STRING> contains (decoded)
+Unicode characters, those characters will be parsed accordingly. So:
+
+    $str = "123\xc2\xa0";   # [ c2 a0 ] == \u00a0, non-breaking space
+
+    normalize($str);
+
+The above will leave C<$str> alone, but this:
+
+    utf8::decode($str);
+
+    normalize($str);
+
+… will trim off the last two bytes from C<$str>.
+
+=cut
+
+sub normalize {    ## no critic qw( Subroutines::RequireArgUnpacking )
+
+    # Alias $_ to the caller’s string so that every step below edits it
+    # in place; the order of the steps matters.
+    for ( $_[0] ) {
+
+        # Strip NUL characters first.
+        tr<\0><>d;
+
+        # The three patterns below use /a because Python’s \s without
+        # re.UNICODE is the same as Perl’s \s with the /a modifier.
+        #
+        # https://docs.python.org/2/library/re.html
+        # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
+
+        # Long tokens. Python: re.compile(r'\S{10,}')
+        s<\S{10,}><>ag;
+
+        # Things that look like email addresses.
+        # Python: re.compile(r'\S+@\S+')
+        s<\S+ @ \S+><>agx;
+
+        # Things that look like URLs.
+        # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
+        s<[a-zA-Z]+ : \S+><>agx;
+
+        # (from digest.py …)
+        # ASCII whitespace goes last because the patterns above rely on
+        # whitespace to delimit tokens.
+        tr< \x09-\x0d><>d;
+
+        # digest.py then calls strip() on the result, and that *does*
+        # remove Unicode whitespace from both ends; hence no /a here.
+        s<\A\s+><>;
+        s<\s+\z><>;
+    }
+
+    return;
+}
+
+#----------------------------------------------------------------------
+
+=head2 $yn = should_handle_line( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It returns a boolean.
+
+=cut
+
+sub should_handle_line {
+    my $line = $_[0];
+
+    # A false value (undef, empty string, "0") is handed straight back.
+    return $line if !$line;
+
+    # Otherwise the line qualifies only if it is long enough.
+    return length($line) >= _MIN_LINE_LENGTH();
+}
+
+#----------------------------------------------------------------------
+
+=head2 $sr = assemble_lines( \@LINES )
+
+This assembles a string buffer out of @LINES. The string is the buffer
+of octets that will be hashed to produce the message digest.
+
+Each member of @LINES is expected to be an B<octet string>, not a
+character string.
+
+=cut
+
+sub assemble_lines {
+    my ($lines_ar) = @_;
+
+    my $total = @$lines_ar;
+
+    # Short messages are used whole.
+    # cf. handle_atomic() in digest.py
+    return \join( q<>, @$lines_ar ) if $total <= _ATOMIC_NUM_LINES();
+
+    #----------------------------------------------------------------------
+    # Longer messages are sampled: for every [ offset, length ] tuple in
+    # _HASH_SPEC, take up to “length” lines starting at “offset” percent
+    # of the way through the message.
+    # cf. handle_pieced() in digest.py
+
+    my $buffer = q<>;
+
+    for my $spec ( _HASH_SPEC() ) {
+        my ( $percent, $count ) = @$spec;
+
+        my $first = int( $percent * $total / 100 );
+
+        for my $idx ( $first .. $first + $count - 1 ) {
+
+            # Indexes past the end of the list are simply skipped.
+            $buffer .= $lines_ar->[$idx] if defined $lines_ar->[$idx];
+        }
+    }
+
+    return \$buffer;
+}
+
+#----------------------------------------------------------------------
+
+=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
+
+=cut
+
+use constant _QUOTED_PRINTABLE_NAMES => (
+    "quopri-codec",
+    "quopri",
+    "quoted-printable",
+    "quotedprintable",
+);
+
+# Make Encode::decode() ignore anything that doesn’t fit the
+# given encoding.
+use constant _encode_check_ignore => q<>;
+
+# Split a Content-Type header value into the pieces digest_payloads() needs.
+#
+# Returns a four-element list:
+#   main type, subtype   - empty string when the parser reports none
+#   encoding             - the normalized charset label, or 'ascii' when
+#                          the header carries no (true) charset value
+#   check value          - what to pass as Encode::decode()’s CHECK argument
+sub parse_content_type {
+    my ($content_type) = @_;
+
+    # NOTE(review): this assignment is global and is never restored, so it
+    # also affects any later use of Email::MIME::ContentType in the process.
+    $Email::MIME::ContentType::STRICT_PARAMS = 0;
+
+    my $parts = Email::MIME::ContentType::parse_content_type(
+        $content_type,
+    );
+
+    my ( $main, $sub ) = map { $parts->{$_} || q<> } ( 'type', 'subtype' );
+
+    my $charset = $parts->{'attributes'}{'charset'};
+
+    # No charset: plain ASCII, ignoring anything that does not fit
+    # (matches Python .decode()’s 'ignore' behavior).
+    return ( $main, $sub, 'ascii', \&_encode_check_ignore ) if !$charset;
+
+    # Lower-case everything, convert underscore to dash, and remove NUL.
+    $charset =~ tr<A-Z_\0><a-z->d;
+
+    # Apparently pyzor accommodates messages that put the transfer
+    # encoding in the Content-Type; those are decoded strictly.
+    my $names_transfer_encoding = grep { $_ eq $charset } _QUOTED_PRINTABLE_NAMES();
+
+    my $checkval =
+      $names_transfer_encoding
+      ? Encode::FB_CROAK()
+      : \&_encode_check_ignore;
+
+    return ( $main, $sub, $charset, $checkval );
+}
+
+#----------------------------------------------------------------------
+
+=head2 @lines = splitlines( $TEXT )
+
+Imitates C<str.splitlines()>. (cf. C<pydoc str>)
+
+Returns a plain list in list context. Returns the number of
+items to be returned in scalar context.
+
+=cut
+
+sub splitlines {
+    my ($text) = @_;
+
+    # Break on CRLF, lone CR or lone LF. split() is returned directly so
+    # that the caller’s context (list vs. scalar) reaches it unchanged.
+    return split /\r\n?|\n/, $text;
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
new file mode 100644
index 0000000..2617b4a
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
@@ -0,0 +1,177 @@
+package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest::StripHtml
+
+=head1 SYNOPSIS
+
+    my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
+
+=head1 DESCRIPTION
+
+This module attempts to duplicate pyzor’s HTML-stripping logic.
+
+=head1 ACCURACY
+
+This library cannot achieve 100%, bug-for-bug parity with pyzor
+because to do so would require duplicating Python’s own HTML parsing
+library. Since that library’s output has changed over time, and those
+changes in turn affect pyzor, it’s literally impossible to arrive at
+a single, fully-compatible reimplementation.
+
+That said, all known divergences between pyzor and this library involve
+invalid HTML as input.
+
+Please open bug reports for any divergences you identify, particularly
+if the input is valid HTML.
+
+=cut
+
+#----------------------------------------------------------------------
+
+use HTML::Parser ();
+
+our $VERSION = '0.03';
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $stripped = strip( $HTML )
+
+Give it some HTML, and it’ll give back the stripped text.
+
+In B<general>, the stripping consists of removing tags as well as
+C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
+removes HTML entities.
+
+This tries very hard to duplicate pyzor’s behavior with invalid HTML.
+
+=cut
+
+# Reduce $html to the text pyzor would hash.
+#
+# The input is trimmed, run through HTML::Parser, and the text chunks that
+# survive are joined with single spaces. Text is collected only while
+# $accumulate is true: a <script> or <style> start tag switches collection
+# off, and any end tag switches it back on.
+sub strip {
+    my ($html) = @_;
+
+    $html =~ s<\A\s+><>;
+    $html =~ s<\s+\z><>;
+
+    my $p = HTML::Parser->new( api_version => 3 );
+
+    # References to the cleaned-up text chunks, in document order.
+    my @pieces;
+
+    my $accumulate = 1;
+
+    $p->handler(
+        start => sub {
+            my ($tagname) = @_;
+
+            $accumulate = 0 if $tagname eq 'script';
+            $accumulate = 0 if $tagname eq 'style';
+
+            return;
+        },
+        'tagname',
+    );
+
+    # Note that *every* end tag re-enables collection, not only
+    # </script> and </style>.
+    $p->handler(
+        end => sub {
+            $accumulate = 1;
+            return;
+        }
+    );
+
+    $p->handler(
+        text => sub {
+            my ($copy) = @_;
+
+            return if !$accumulate;
+
+            # pyzor’s HTML parser discards HTML entities. On top of that,
+            # we need to match, as closely as possible, pyzor’s handling of
+            # invalid HTML entities … which is a function of Python’s
+            # standard HTML parsing library. This will probably never be
+            # fully compatible with pyzor, but we can get it close.
+
+            # The original is:
+            #
+            #   re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+            #
+            # The parsing loop then “backs up” one byte if the last
+            # character isn’t a “;”. We use a look-ahead assertion to
+            # mimic that behavior.
+            $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F]) )>< >gx;
+
+            # The original is:
+            #
+            #   re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+            #
+            # We again use a look-ahead assertion to mimic Python.
+            $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9]) )>< >gx;
+
+            # Python’s HTMLParser aborts its parsing loop when it encounters
+            # an invalid numeric reference. Everything from the first such
+            # reference to the end of the chunk is therefore replaced: by
+            # nothing if no “;” follows, otherwise by a bare “&#”.
+            $copy =~ s<\&\#
+                (?:
+                    [^0-9xX]        # anything but the expected first char
+                    |
+                    [0-9]+[a-fA-F]  # hex within decimal
+                    |
+                    [xX][^0-9a-fA-F]
+                )
+                (.*)
+            ><
+                ( -1 == index($1, ';') ) ? q<> : '&#'
+            >exs;
+
+            # Python’s HTMLParser treats invalid entities as incomplete
+            $copy =~ s<(\&\#?)><$1 >gx;
+
+            $copy =~ s<\A\s+><>;
+            $copy =~ s<\s+\z><>;
+
+            push @pieces, \$copy if length $copy;
+        },
+
+        # Only the text itself is used by the callback above; the extra
+        # tagname argument is ignored.
+        'text,tagname',
+    );
+
+    $p->parse($html);
+    $p->eof();
+
+    my $payload = join( q< >, map { $$_ } @pieces );
+
+    # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
+    # plain spaces.
+    $payload =~ s<[^\S\x{a0}]+>< >g;
+
+    return $payload;
+}
+
+1;
diff --git a/t/pyzor.t b/t/pyzor.t
index 891f38d..e4ef83f 100755
--- a/t/pyzor.t
+++ b/t/pyzor.t
@@ -3,12 +3,9 @@
 use lib '.'; use lib 't';
 use SATest; sa_t_init("pyzor");
 
-use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
-
 use Test::More;
 plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
-plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
-plan tests => 8;
+plan tests => 5;
 
 diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
 
@@ -30,7 +27,7 @@ tstprefs ("
 sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
 ok_all_patterns();
 # Same with fork
-sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
+sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
 ok_all_patterns();
 
 #TESTING FOR HAM
@@ -44,7 +41,3 @@ ok_all_patterns();
 
 sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
 ok_all_patterns();
-# same with fork
-sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1", \&patterns_run_cb);
-ok_all_patterns();
-

Attachment: signature.asc
Description: PGP signature

Reply via email to