Package: munin-node
Version: 1.4.5-3
Severity: normal
Tags: patch

hddtemp_smartctl has numerous bugs and annoymances e.g.:

. Fails to collect anything on systems with large numbers of drives due
to timeouts.

. Fails to collect anything on systems with one or more faulty or slow
drives due to timeouts.

. Fails to collect anything on busy (e.g. I/O) systems due to timeouts

. Arbitrarily sets graph minimum to zero celsius - I have three drives
here which say:

Min/Max recommended Temperature:     -4/72 Celsius
Min/Max Temperature Limit:           -9/77 Celsius

and five which say:

Min/Max recommended Temperature:      0/60 Celsius
Min/Max Temperature Limit:           -40/70 Celsius

. Doesn't query temperature limits from drives

. On Linux systems only the first 26 drives are checked (e.g. not /dev/sdaa,
/dev/sdab etc.).

. Fails to detect smartctl '--nocheck' option on Squeeze+

I think this patch fixes those bugs sufficiently to get it working on my
system. 

Cheers,

Tim.



--- /usr/share/munin/plugins/hddtemp_smartctl   2010-10-05 14:39:07.000000000 
+0100
+++ hddtemp_smartctlmt  2011-07-31 14:01:23.415941442 +0100
@@ -23,6 +23,13 @@
              not return its temperature when only the -A option is
              used.
  dev_$dev  - monitoring device for one drive, e.g. twe0
+ maxchild  - Maximum number of smartctl child processes to run at any
+            one time - defaults to 4
+ minstartdelaysecs   - Time in seconds to wait between spawning child
+                       processes, defaults to 0.1
+ smartctltimeoutsecs - Maximum time to wait for a child process to run
+                      in case of a broken drive etc.  Defaults to 5
+            
 
 If the "smartctl" enviroment variable is not set the plugin will
 search your $PATH, /usr/bin, /usr/sbin, /usr/local/bin and
@@ -89,11 +96,29 @@
 smartctl then hdparm will be used.  Note that hdparm isn't available
 on all platforms.
 
+=head1 BUGS
+
+  * Uses part of the device name as the munin data source fieldname,
+    which can and does frequently change.  Should be derived from the
+    drive model+serial number instead.
+  * All modern drives make their recommended/absolute min/max
+    rated temperatures available - see 'smartctl -l scttemp' this
+    info should be provided by this plugin when run with the "config"
+    argument.
+
 =cut
 
 use strict;
+use Errno;
+use POSIX;
+use IO::File;
 
+sub pipe_from_fork ($);
 my $DEBUG = $ENV{'MUNIN_DEBUG'} || 0;
+my $maxchild = $ENV{'maxchild'} || 4;
+my $minstartdelaysecs = $ENV{'minstartdelaysecs'} || 0.1;
+my $smartctltimeoutsecs = $ENV{'timeout'} || 5;
+
 
 my $smartctl = exists $ENV{smartctl} ? $ENV{smartctl} : '';
 
@@ -116,8 +141,8 @@
 
 # Check version of smartctl to determine --nocheck capabilities
 my $use_nocheck = 0;
-if ($smartctl and `$smartctl --version` =~ / version (\d+\.\d+) /i) {
-    $use_nocheck = $1 >= 5.37;
+if ($smartctl and `$smartctl --help` =~ /nocheck/) {
+    $use_nocheck = 1;
     warn "[DEBUG] Smartctl supports --nocheck\n" if $DEBUG;
 }
 
@@ -141,7 +166,7 @@
   my @drivesSCSI;
   if (-d '/sys/block/') {
     opendir(SCSI, '/sys/block/');
-    @drivesSCSI = grep /sd[a-z]/, readdir SCSI;
+    @drivesSCSI = grep /sd[a-z]+/, readdir SCSI;
     closedir(SCSI);
    }
 
@@ -185,7 +210,6 @@
     }
   } elsif ($ARGV[0] eq 'config') {
     print "graph_title HDD temperature\n";
-    print "graph_args --base 1000 -l 0\n";
     print "graph_vlabel temp in °C\n";
     print "graph_category sensors\n";
     print "graph_info This graph shows the temperature in degrees Celsius of 
the hard drives in the machine.\n";
@@ -194,7 +218,14 @@
   }
 }
 
-foreach my $drive (@drives) {
+my %smartreaders;
+my @remainingdrives = @drives;
+
+while (@remainingdrives || keys %smartreaders) {
+  # Fire-up another smartctl?
+  if (@remainingdrives && ((keys %smartreaders) < $maxchild)) { 
+
+    my $drive = shift @remainingdrives;
   warn "[DEBUG] Processing $drive\n" if $DEBUG;
   my $fulldev = device_for_drive($drive);
 
@@ -209,29 +240,88 @@
   }
 
   my $cmd = command_for_drive_device($drive, $fulldev, $use_nocheck);
-  warn "[DEBUG] Command for $drive is % $cmd %\n" if $DEBUG;
+    $smartreaders{$drive}{'cmd'} = $cmd;
 
-  my $output = `$cmd`;
-  if ($? ne 0) {
-      print "$drive.value U\n";
-      print "$drive.extinfo Command $cmd on drive $drive failed: $?.  The 
plugin needs to have read permission on all monitored devices.\n";
-      warn "[ERROR] Command $cmd on drive $drive failed: $?.  The plugin needs 
to have read permission on all monitored devices.\n";
-      next;
+    # This did use perl's "open |-" syntax, but this didn't work in 5.8.0 :-(
+    $smartreaders{$drive}{'fh'} = IO::File->new;
+    $smartreaders{$drive}{'pid'} = pipe_from_fork($smartreaders{$drive}{'fh'});
+
+    if ($smartreaders{$drive}{'pid'}) { # parent process
+      # switch to non-blocking reading of the pipe
+      use Fcntl;
+      my $flags = 0;
+      fcntl($smartreaders{$drive}{'fh'}, F_GETFL, $flags) or die "Couldn't get 
flags for $smartreaders{$drive}{'fh'} : $!\n";
+      $flags |= O_NONBLOCK;
+      fcntl($smartreaders{$drive}{'fh'}, F_SETFL, $flags) or die "Couldn't set 
flags for $smartreaders{$drive}{'fh'}: $!\n";
+      $smartreaders{$drive}{'output'} = '';
+    } else { # child process - read with timeout.
+      close STDIN;
+      close STDERR;
+      eval {
+        local $SIG{ALRM} = sub { die "TIMEOUT: $cmd" };
+        alarm $smartctltimeoutsecs;
+        #print "Current Drive Temperature: 42\n";
+        print `$cmd`;
+      };
+      if ($@ and $@ =~ /^TIMEOUT/) { print "TIMEOUT: $cmd"; exit 1 }
+      exit 0;
+    }
+
+  }
+  # Collect any waiting data from the smartctl child processes
+  foreach my $driveworker (keys %smartreaders) {
+    my $rv = sysread($smartreaders{$driveworker}{'fh'}, my $cmdoutput, 1024);
+    if (!defined($rv)) {
+      if ($! != Errno::EAGAIN) {
+        warn "[ERROR] Failed to read from child process for $driveworker: $?";
+        delete $smartreaders{$driveworker};
+      }
+    } else {
+      if ($rv > 0) {
+        $smartreaders{$driveworker}{'output'} .= $cmdoutput;
+      } else { # EOF on pipe...
+        close $smartreaders{$driveworker}{'fh'};
+        POSIX::waitpid($smartreaders{$driveworker}{'pid'}, 0);
+        my $rc = $?;
+        if (WIFSIGNALED($rc)) {
+          warn "Child process existed with signal\n";
   }
-  if ($output =~ /Current Drive Temperature:\s*(\d+)/) {
-    print "$drive.value $1\n";
-  } elsif ($output =~ /^(194 Temperature_Celsius.*)/m) {
+        if (WIFEXITED($rc)) {
+          #print "exit status: " . WEXITSTATUS($rc) . "\n";
+        } else {
+          print "wifexited returned false - exit status: " . WEXITSTATUS($rc) 
. "\n";
+        }
+        if (WIFEXITED($rc) && (WEXITSTATUS($rc) != 0)) {
+          my $errmsg;
+          print "$driveworker.value U\n";
+          warn $smartreaders{$driveworker}{'output'} if $DEBUG;
+          if ($smartreaders{$driveworker}{'output'} =~ /^TIMEOUT/) {
+            $errmsg = "$driveworker timed out after $smartctltimeoutsecs 
seconds: '" . $smartreaders{$driveworker}{'cmd'} . "'\n";
+          } else {
+            $errmsg = "'" . $smartreaders{$driveworker}{'cmd'} . "' on drive 
$driveworker failed with exit status: " . WEXITSTATUS($?) . ".  The plugin 
needs to have read permission on all monitored devices.\n";
+          }
+          print "$driveworker.extinfo $errmsg";
+          warn "[ERROR] $errmsg";
+        } else {
+          if ($smartreaders{$driveworker}{'output'} =~ /Current Drive 
Temperature:\s*(\d+)/) {
+            print "$driveworker.value $1\n";
+          } elsif ($smartreaders{$driveworker}{'output'} =~ /^(194 
Temperature_Celsius.*)/m) {
     my @F = split /\s+/, $1;
-    print "$drive.value $F[9]\n";
-  } elsif ($output =~ /^(231 Temperature_Celsius.*)/m) {
+            print "$driveworker.value $F[9]\n";
+          } elsif ($smartreaders{$driveworker}{'output'} =~ /^(231 
Temperature_Celsius.*)/m) {
     my @F = split ' ', $1;
-    print "$drive.value $F[9]\n";
+            print "$driveworker.value $F[9]\n";
   } else {
-      print "$drive.value U\n";
-      print "$drive.extinfo Temperature not detected in smartctl output\n";
+            print "$driveworker.value U\n";
+            print "$driveworker.extinfo Temperature not detected in smartctl 
output\n";
   }
 }
-
+        delete $smartreaders{$driveworker};
+      }
+    }
+  }
+  sleep $minstartdelaysecs;
+}
 
 sub device_for_drive {
     my ($drive) = @_;
@@ -256,3 +346,19 @@
 
 }
 
+# simulate open(FOO, "-|"), but in a way that actually gives you access to the
+# exit status of the process on the end of the pipe - from "perldoc perlfork"
+sub pipe_from_fork ($) {
+  my $parent = shift;
+  pipe $parent, my $child or die;
+  my $pid = fork();
+  die "fork() failed: $!" unless defined $pid;
+  if ($pid) {
+    close $child;
+  }
+  else {
+    close $parent;
+    open(STDOUT, ">&=" . fileno($child)) or die;
+  }
+  $pid;
+}


-- System Information:
Debian Release: 6.0.2
  APT prefers stable
  APT policy: (500, 'stable')
Architecture: amd64 (x86_64)

Kernel: Linux 2.6.32-5-openvz-amd64 (SMP w/8 CPU cores)
Locale: LANG=en_GB.UTF-8, LC_CTYPE=en_GB.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash

Versions of packages munin-node depends on:
ii  adduser                3.112+nmu2        add and remove users and groups
ii  gawk                   1:3.1.7.dfsg-5    GNU awk, a pattern scanning and pr
ii  libnet-server-perl     0.97-1            An extensible, general perl server
ii  lsb-base               3.2-23.2squeeze1  Linux Standard Base 3.2 init scrip
ii  munin-common           1.4.5-3           network-wide graphing framework (c
ii  perl                   5.10.1-17squeeze2 Larry Wall's Practical Extraction 
ii  procps                 1:3.2.8-9         /proc file system utilities

Versions of packages munin-node recommends:
ii  libnet-snmp-perl              5.2.0-4    Script SNMP connections

Versions of packages munin-node suggests:
ii  acpi                    1.5-2            displays information on ACPI devic
ii  ethtool                 1:2.6.34-3       display or change Ethernet device 
ii  hdparm                  9.32-1           tune hard disk parameters for high
pn  libcache-cache-perl     <none>           (no description available)
ii  libcrypt-ssleay-perl    0.57-2           Support for https protocol in LWP
ii  libdbd-mysql-perl       4.016-1          Perl5 database interface to the My
pn  libdbd-pg-perl          <none>           (no description available)
pn  liblwp-useragent-determ <none>           (no description available)
pn  libnet-irc-perl         <none>           (no description available)
ii  libnet-ssleay-perl      1.36-1           Perl module for Secure Sockets Lay
pn  libtext-csv-xs-perl     <none>           (no description available)
ii  libwww-perl             5.836-1          Perl HTTP/WWW client/server librar
ii  libxml-simple-perl      2.18-3           Perl module for reading and writin
ii  lm-sensors              1:3.1.2-6        utilities to read temperature/volt
ii  logtail                 1.3.13           Print log file lines that have not
ii  munin                   1.4.5-3          network-wide graphing framework (g
pn  munin-java-plugins      <none>           (no description available)
ii  munin-plugins-extra     1.4.5-3          network-wide graphing framework (u
ii  mysql-client            5.1.49-3         MySQL database client (metapackage
ii  mysql-client-5.1 [mysql 5.1.49-3         MySQL database client binaries
ii  net-tools               1.60-23          The NET-3 networking toolkit
ii  python                  2.6.6-3+squeeze6 interactive high-level object-orie
ii  ruby                    4.5              An interpreter of object-oriented 
ii  smartmontools           5.39.1+svn3124-2 control and monitor storage system

-- Configuration Files:
/etc/munin/munin-node.conf changed [not included]
/etc/munin/plugin-conf.d/munin-node [Errno 13] Permission denied: 
u'/etc/munin/plugin-conf.d/munin-node'

-- no debconf information



--
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmas...@lists.debian.org

Reply via email to