[PATCH] replace call to file(1) with libmagic

Markus Benning Tue, 22 Jul 2014 01:35:22 -0700

Hello Mark,

I did some research on optimizing Amavis/SpamAssassin in our test setup
with NYTProf.
I found that replacing the call to the external file(1) tool with
calls to File::LibMagic increased the throughput from 4.1 to 4.8 msg/s.
It also simplifies the code in determine_file_types().


 Markus

--- amavisd.patched-ctch	2014-07-10 15:01:01.162732554 +0200
+++ amavisd.libmagic	2014-07-10 15:51:04.054289054 +0200
@@ -27318,7 +27318,7 @@
 
 BEGIN {
   require Exporter;
-  use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION);
+  use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION $magic);
   $VERSION = '2.316';
   @ISA = qw(Exporter);
   @EXPORT_OK = qw(&init &decompose_part &determine_file_types);
@@ -27331,6 +27331,9 @@
   import Amavis::Lookup qw(lookup lookup2);
   import Amavis::Unpackers::MIME qw(mime_decode);
   import Amavis::Unpackers::NewFilename qw(consumed_bytes);
+
+  use File::LibMagic;
+  $magic = File::LibMagic->new;
 }
 use subs @EXPORT_OK;
 
@@ -27431,52 +27434,15 @@
 # call 'file(1)' utility for each part,
 # and associate (save) full and short file content types with each part
 #
+
 sub determine_file_types($$) {
   my($tempdir, $partslist_ref) = @_;
-  defined $file && $file ne ''
-    or die "Unix utility file(1) not available, but is needed";
   my(@all_part_list) = grep($_->exists, @$partslist_ref);
   my $initial_num_parts = scalar(@all_part_list);
-  my $cwd = "$tempdir/parts";
-  if (@all_part_list) { chdir($cwd) or die "Can't chdir to $cwd: $!" }
-  my($proc_fh,$pid); my $eval_stat;
-  eval {
-    while (@all_part_list) {
-      my(@part_list,@file_list); # collect reasonably small subset of filenames
-      my $arglist_size = length($file);  # size of a command name itself
-      while (@all_part_list) {   # collect as many args as safe, at least one
-        my $nm = $all_part_list[0]->full_name;
-        local($1); $nm =~ s{^\Q$cwd\E/(.*)\z}{$1}s;  # remove cwd from filename
-        # POSIX requires 4 kB as a minimum buffer size for program arguments
-        last  if @file_list && $arglist_size + length($nm) + 1 > 4000;
-        push(@part_list, shift(@all_part_list));     # swallow the next one
-        push(@file_list, $nm);  $arglist_size += length($nm) + 1;
-      }
-      if (scalar(@file_list) < $initial_num_parts) {
-        do_log(2, "running file(1) on %d (out of %d) files, arglist size %d",
-                   scalar(@file_list), $initial_num_parts, $arglist_size);
-      } else {
-        do_log(5, "running file(1) on %d files, arglist size %d",
-                   scalar(@file_list), $arglist_size);
-      }
-      ($proc_fh,$pid) = run_command(undef, '&1', $file, @file_list);
-      my $index = 0; my $ln;
-      for ($! = 0; defined($ln=$proc_fh->getline); $! = 0) {
-        do_log(5, "result line from file(1): %s", $ln);
-        chomp($ln); local($1,$2);
-        if ($index > $#file_list) {
-          do_log(-1,"NOTICE: Skipping unexpected output from file(1): %s",$ln);
-        } else {
-          my $part   = $part_list[$index];  # walk through @part_list in sync
-          my $expect = $file_list[$index];  # walk through @file_list in sync
-          if ($ln !~ /^(\Q$expect\E):[ \t]*(.*)\z/s) {
-            # split file name from type
-            do_log(-1,"NOTICE: Skipping bad output from file(1) ".
-                      "at [%d, %s], got: %s", $index,$expect,$ln);
-          } else {
-            my $type_short; my $actual_name = $1; my $type_long = $2;
-            $type_short =
-              lookup2(0,$type_long,\@map_full_type_to_short_type_maps);
+
+  foreach my $part ( @all_part_list ) {
+            my $type_long = $magic->describe_filename($part->full_name);
+            my $type_short = lookup2(0,$type_long,\@map_full_type_to_short_type_maps);
             ll(4) && do_log(4, "File-type of %s: %s%s",
                                $part->base_name, $type_long,
                                (!defined $type_short ? ''
@@ -27487,40 +27453,8 @@
             $part->attributes_add('C')    # simpleminded
               if !ref($type_short) ? $type_short eq 'pgp'  # encrypted?
                                    : grep($_ eq 'pgp', @$type_short);
-            $index++;
-          }
-        }
-      }
-      defined $ln || $! == 0 || $! == EAGAIN
-        or die "Error reading from file(1) utility: $!";
-      do_log(-1,"unexpected(file): %s",$!)  if !defined($ln) && $! == EAGAIN;
-      my $err = 0; $proc_fh->close or $err = $!;
-      my $child_stat = defined $pid && waitpid($pid,0) > 0 ? $? : undef;
-      undef $proc_fh; undef $pid; my(@errmsg);
-      # exit status is 1 when result is 'ERROR: ...', accept it mercifully
-      proc_status_ok($child_stat,$err, 0,1)
-        or push(@errmsg, "failed, ".exit_status_str($child_stat,$err));
-      if ($index < @part_list) {
-        push(@errmsg, sprintf("parsing failure - missing last %d results",
-                              @part_list - $index));
-      }
-      !@errmsg  or die join(", ",@errmsg);
-      # even though exit status 1 is accepted, log a warning nevertheless
-      proc_status_ok($child_stat,$err)
-        or do_log(-1, "file utility failed: %s",
-                       exit_status_str($child_stat,$err));
-    }
-    1;
-  } or do {
-    $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
-    kill_proc($pid,$file,1,$proc_fh,$eval_stat)  if defined $pid;
-  };
-  chdir($tempdir) or die "Can't chdir to $tempdir: $!";
-  section_time(sprintf('get-file-type%d', $initial_num_parts));
-  if (defined $eval_stat) {
-    do_log(-2, "file(1) utility (%s) FAILED: %s", $file,$eval_stat);
-  # die "file(1) utility ($file) error: $eval_stat";
   }
+  section_time(sprintf('get-file-type%d', $initial_num_parts));
 }
 
 sub decompose_mail($$) {

signature.asc
Description: Digital signature

[PATCH] replace call to file(1) with libmagic

Reply via email to