Title: [opsview] [12046] Account for race condition when checking Opsview slave cluster health
Revision
12046
Author
dferguson
Date
2013-04-11 16:53:09 +0100 (Thu, 11 Apr 2013)

Log Message

Account for race condition when checking Opsview slave cluster health

Nagios periodically writes out status.dat.  If the cluster check runs while the file is being written, it fails with a 'Cannot open status log for reading' error.  Customers who alert on SOFT errors see this fairly regularly.

Modified Paths

Modified: trunk/opsview-core/nagios-plugins/check_opsview_slave_cluster
===================================================================
--- trunk/opsview-core/nagios-plugins/check_opsview_slave_cluster	2013-04-11 14:57:41 UTC (rev 12045)
+++ trunk/opsview-core/nagios-plugins/check_opsview_slave_cluster	2013-04-11 15:53:09 UTC (rev 12046)
@@ -63,18 +63,49 @@
       . ' -e 60 -v -v'
 );
 warn "cmd: ", join( ' ', @cmd ), $/ if ( $np->opts->verbose );
-open F, "-|", @cmd or $np->nagios_exit( CRITICAL, "Cannot run ssh command" );
-my $info;
-{
+my ( $info, $rc ) = run_command(@cmd);
 
-    # Grab all lines and convert linefeeds to \n
-    local $/ = undef;
-    $info = <F>;
-    $info =~ s/\n/\\n/g
-};
-$info =~ s/\\n$//;
-close F or $np->nagios_exit( CRITICAL, "Error: $info ($?)" );
-
 warn "info: ", $info, $/ if ( $np->opts->verbose );
 
+# NOTE: there is a race condition here when nagios is writing out status.dat,
+# so give it a chance and try again
+if ( $info =~ m/Cannot open status log for reading/ ) {
+    my $delay = 5;
+    if ( $np->opts->verbose ) {
+        warn
+          "status.dat not available - sleeping for $delay seconds and trying again",
+          $/;
+    }
+    sleep $delay;
+    ( $info, $rc ) = run_command(@cmd);
+    warn "info: ", $info, $/ if ( $np->opts->verbose );
+}
+
+if ($rc) {
+    $np->nagios_exit( CRITICAL, "Error: $info ($?)" );
+}
+
 $np->nagios_exit( OK, "" );
+
+# Run @cmd and return ( $info, $rc ): $info is its stdout with newlines
+# escaped as literal \n (trailing one stripped); $rc is $? after close().
+sub run_command {
+    my (@cmd) = @_;
+
+    # Lexical handle instead of the global bareword F.
+    open my $fh, "-|", @cmd
+      or $np->nagios_exit( CRITICAL, "Cannot run ssh command" );
+    my $info;
+    {
+        # Grab all lines and convert linefeeds to \n
+        local $/ = undef;
+        $info = <$fh>;
+        $info =~ s/\n/\\n/g;
+    }
+    $info =~ s/\\n$//;
+
+    # close() on a pipe waits for the child and sets $?; the caller checks $rc.
+    close $fh;
+    my $rc = $?;
+    return ( $info, $rc );
+}

_______________________________________________
Opsview-checkins mailing list
Opsview-checkins@lists.opsview.org
http://lists.opsview.org/lists/listinfo/opsview-checkins

Reply via email to