Title: [opsview] [12046] Account for race condition when checking Opsview slave cluster health
- Revision
- 12046
- Author
- dferguson
- Date
- 2013-04-11 16:53:09 +0100 (Thu, 11 Apr 2013)
Log Message
Account for race condition when checking Opsview slave cluster health
Nagios periodically writes out status.dat. If the cluster check runs while the file is being written, you get a 'Cannot open status log for reading' error. When a customer alerts on SOFT errors, this can be seen fairly regularly.
Modified Paths
Modified: trunk/opsview-core/nagios-plugins/check_opsview_slave_cluster
===================================================================
--- trunk/opsview-core/nagios-plugins/check_opsview_slave_cluster 2013-04-11 14:57:41 UTC (rev 12045)
+++ trunk/opsview-core/nagios-plugins/check_opsview_slave_cluster 2013-04-11 15:53:09 UTC (rev 12046)
@@ -63,18 +63,49 @@
. ' -e 60 -v -v'
);
warn "cmd: ", join( ' ', @cmd ), $/ if ( $np->opts->verbose );
-open F, "-|", @cmd or $np->nagios_exit( CRITICAL, "Cannot run ssh command" );
-my $info;
-{
+my ( $info, $rc ) = run_command(@cmd);
- # Grab all lines and convert linefeeds to \n
- local $/ = undef;
- $info = <F>;
- $info =~ s/\n/\\n/g
-};
-$info =~ s/\\n$//;
-close F or $np->nagios_exit( CRITICAL, "Error: $info ($?)" );
-
warn "info: ", $info, $/ if ( $np->opts->verbose );
+# NOTE: there is a race condition here when nagios is writing out status.dat
+# so give it a chance and try again
+if ( $info =~ m/Cannot open status log for reading/ ) {
+ my $delay = 5;
+ if ( $np->opts->verbose ) {
+ warn
+ "status.dat not available - sleeping for $delay seconds and trying again",
+ $/;
+ }
+ sleep $delay;
+ ( $info, $rc ) = run_command(@cmd);
+ warn "info: ", $info, $/ if ( $np->opts->verbose );
+}
+
+if ($rc) {
+ $np->nagios_exit( CRITICAL, "Error: $info ($?)" );
+}
+
$np->nagios_exit( OK, "" );
+
+sub run_command {
+ my (@cmd) = @_;
+ # Runs @cmd with its output piped back, and returns ( $info, $rc ) to the caller.
+ open F, "-|", @cmd
+ or $np->nagios_exit( CRITICAL, "Cannot run ssh command" );
+ my $info;
+ {
+ # Slurp mode: with $/ undefined (only inside this bare block) <F> returns all output at once.
+ # Each newline is then rewritten as the two characters \n, so $info holds no real newlines.
+ local $/ = undef;
+ $info = <F>;
+ $info =~ s/\n/\\n/g
+ };
+ $info =~ s/\\n$//; # drop the literal \n produced by a trailing newline
+ close F; # closing a piped handle waits for the command and sets $?; failure is not fatal here
+ my $rc = $?;
+
+ # $rc is the raw wait status from $? (not a shifted exit code); the caller tests it for non-zero.
+
+ return ( $info, $rc );
+
+}
_______________________________________________
Opsview-checkins mailing list
Opsview-checkins@lists.opsview.org
http://lists.opsview.org/lists/listinfo/opsview-checkins