Patch 1/4

This patch adds a "failurecount" option to ldirectord.

This implements a global or per-virtual service option indicating how
many successive check failures must occur before ldirectord will remove
(or quiesce) the real server.  A check success will reset the counter of
failures.

We have a handful of special-case hosts that occasionally (and somewhat
intentionally) hit high load and timeout on a check.  This option gives
us some additional configuration and checking flexibility and lets us
monitor the status of the hosts closely without being "twitchy" in our
reaction.

Thanks,
Sean
# HG changeset patch
# User Sean E. Millichamp <[email protected]>
# Date 1242067452 14400
# Node ID 74cca79b94440d95b1b2891e50f05f7f4afd3116
# Parent  9c12612aba5ea19eed0dc8b897f404f7a866193f
Adds "failurecount" option to ldirectord.

This implements a global or per-virtual service option indicating how many
successive check failures must occur before ldirectord will remove (or quiesce)
the real server.  A check success will reset the counter of failures.

Signed-Off-By: Sean E. Millichamp <[email protected]>

diff -r 9c12612aba5e -r 74cca79b9444 ldirectord/ldirectord.in
--- a/ldirectord/ldirectord.in	Mon May 11 13:53:08 2009 +0200
+++ b/ldirectord/ldirectord.in	Mon May 11 14:44:12 2009 -0400
@@ -165,6 +165,17 @@
 
 Default: 1
 
+B<failurecount = >I<n>
+
+The number of consecutive times a failure will have to be reported by a
+check before the realserver is removed from the kernel's LVS table.  A 
+value of 1 will have the realserver removed on the first failure.  A
+successful check will reset the failure counter to 0.
+
+If defined in a virtual server section then the global value is overriden.
+
+Default: 1
+
 B<autoreload = >B<yes>|B<no>
 
 Defines if <ldirectord> should continuously check the configuration file
@@ -664,6 +675,7 @@
 	    $CHECKTIMEOUT
 	    $DEFAULT_CHECKTIMEOUT
 	    $CHECKCOUNT
+	    $FAILURECOUNT
 	    $QUIESCENT
 	    $FORKING
 	    $EMAILALERT
@@ -730,6 +742,7 @@
 $DEFAULT_CHECKTIMEOUT     = 5;
 $CHECKTIMEOUT     = -1;
 $CHECKCOUNT       = 1;
+$FAILURECOUNT     = 1;
 $LDIRECTORD       = ld_find_cmd("ldirectord", 1);
 if (! defined $LDIRECTORD) {
 	$LDIRECTORD = "@sbindir@/ldirectord";
@@ -1225,6 +1238,7 @@
 			$vsrv{checktimeout} = -1;
 			$vsrv{checkcount} = -1;
 			$vsrv{negotiatetimeout} = -1;
+			$vsrv{failurecount} = -1;
 			$vsrv{num_connects} = 0;
 			$vsrv{httpmethod} = "GET";
 			$vsrv{secret} = "";
@@ -1279,6 +1293,9 @@
 				} elsif ($rcmd =~ /^checkcount\s*=\s*(.*)/){
                                         $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check count");
                                         $vsrv{checkcount} = $1;
+				} elsif ($rcmd =~ /^failurecount\s*=\s*(.*)/){
+                                        $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid failure count");
+                                        $vsrv{failurecount} = $1;
 				} elsif ($rcmd =~ /^checkinterval\s*=\s*(.*)/){
                                         $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid checkinterval");
                                         $vsrv{checkinterval} = $1
@@ -1487,6 +1504,10 @@
 			$1 =~ /(\d+)/ && $1 or &config_error($line, 
 					"invalid check count value");
 			$CHECKCOUNT = $1;
+		} elsif ($linedata  =~ /^failurecount\s*=\s*(.*)/) {
+			$1 =~ /(\d+)/ && $1 or &config_error($line, 
+					"invalid failure count value");
+			$FAILURECOUNT = $1;
 		} elsif ($linedata  =~ /^fallback\s*=\s*(.*)/) {
 			my $tcp = parse_fallback($line, $1, undef);
 			my $udp = parse_fallback($line, $1, undef);
@@ -2119,6 +2140,10 @@
 		if ($$v{checkcount} < 0) {
 			$$v{checkcount} = $CHECKCOUNT;
 		}
+
+		if ($$v{failurecount} < 0) {
+			$$v{failurecount} = $FAILURECOUNT;
+		}
 	}
 }
 
@@ -3720,10 +3745,12 @@
 }
 
 
-# Set the status of a server as up
-# Should only be called from _service_up or _ld_start
-
-sub _status_up
+
+# Check the status of a server
+# Should only be called from _status_up, _status_down,
+# _service_up, or _service_down
+# Returns 1 if the server is up, 0 if down
+sub _status_check
 {
 	my ($v, $r, $is_fallback) = (@_);
 
@@ -3734,15 +3761,30 @@
 		if (defined($v->{real_status}) or
 				(defined($v->{fallback_status}) and
 				$v->{fallback_status}->{"$real_id"})) {
-			return undef;
+			return 1;
 		}
 	}
 	else {
 		if (defined ($v->{real_status}) and
 				$v->{real_status}->{"$real_id"}) {
-			return undef;
-		}
-	}
+			return 1;
+		}
+	}
+	return 0;
+}
+
+
+# Set the status of a server as up
+# Should only be called from _service_up or _ld_start
+
+sub _status_up
+{
+	my ($v, $r, $is_fallback) = (@_);
+
+	my $virtual_id = get_virtual_id_str($v);
+	my $real_id = get_real_id_str($r, $v);
+
+	return undef if(_status_check($v, $r, $is_fallback));
 
 	$r->{virtual_status}->{"$virtual_id"} = 1;
 	if (defined $is_fallback) {
@@ -3765,19 +3807,7 @@
 	my $virtual_id = get_virtual_id_str($v);
 	my $real_id = get_real_id_str($r, $v);
 
-	if (defined($is_fallback)) {
-		if (! defined($v->{real_status}) and
-				 (! defined($v->{fallback_status}) or
-				  !$v->{fallback_status}->{"$real_id"})) {
-			return undef;
-		}
-	}
-	else {
-		if (! defined ($v->{real_status}) or
-				! $v->{real_status}->{"$real_id"}) {
-			return undef;
-		}
-	}
+	return undef if (!_status_check($v, $r, $is_fallback));
 
 	if (defined($is_fallback)) {
 		delete $v->{fallback_status}->{"$real_id"};
@@ -3819,6 +3849,13 @@
 {
 	my ($v, $r, $force) = (@_);
 
+	if ($r->{failcount} > 0) {
+		ld_log("Resetting soft failure count: " . $r->{server} . ":" .
+		       $r->{port} . " (" . get_virtual_id_str($v) . ")");
+	}
+
+	$r->{failcount} = 0;
+
 	if (! _status_up($v, $r) and ! defined($force)) {
 		return;
 	}
@@ -3845,9 +3882,21 @@
 {
 	my ($v, $r, $force) = @_;
 
-	if (! _status_down($v, $r) and ! defined($force)) {
-		return;
-	}
+	if (!_status_check($v, $r) and !defined($force)) {
+		return;
+	}
+
+	$r->{failcount}++;
+
+	if (!defined($force) and _status_check($v, $r) and
+	     ($r->{failcount} < $v->{failurecount})) {
+		ld_log("Soft failure real server: " . $r->{server} . ":" .
+		       $r->{port} . " (" . get_virtual_id_str($v) .
+		       ") failure " . $r->{failcount} . "/" . $v->{failurecount});
+		return;
+	}
+
+	_status_down($v, $r);
 
         &_remove_service($v, $r->{server} . ":" . $r->{port}, 
                           $r->{forw}, "real");
_______________________________________________
Please read the documentation before posting - it's available at:
http://www.linuxvirtualserver.org/

LinuxVirtualServer.org mailing list - [email protected]
Send requests to [email protected]
or go to http://lists.graemef.net/mailman/listinfo/lvs-users

Reply via email to