[PATCH] add a maintenance mode to servers

Cyril Bonté Sun, 31 Jan 2010 10:55:26 -0800

Hi Willy,
This is a first attempt to add a maintenance mode for servers, using the stats
socket (at admin level).


It can be done with the following command:
set maintenance <backend>/<server> on (use "off" instead of "on" to leave maintenance mode).

In this mode, no more checks will be performed on the server and it will be 
marked as a special DOWN state (MAINT).
If other servers are tracking it, they will go UP until the server leaves
maintenance mode.
The stats page and the CSV export also display this special state.

This can be used to disable the server in haproxy before doing some operations 
on this server itself.
This is a good complement to the "http-check disable-on-404" keyword and works 
in TCP mode.

Tell me if it looks OK to you or if it needs some more work.
Thanks ;)

-- 
Cyril Bonté

diff -Naur haproxy-ss-20100129/doc/configuration.txt haproxy-ss-20100129-maintain/doc/configuration.txt
--- haproxy-ss-20100129/doc/configuration.txt	2010-01-28 20:35:13.000000000 +0100
+++ haproxy-ss-20100129-maintain/doc/configuration.txt	2010-01-31 19:13:37.000000000 +0100
@@ -7624,6 +7624,24 @@
   may be specified either by their name or by their numeric ID, prefixed with a
   dash ('#').
 
+set maintenance <backend>/<server> <state>
+  Mark the server DOWN for maintenance. In this mode, no more checks will be
+  performed on the server until it leaves maintenance.
+  If the server is tracked by other servers, those servers will be set to UP
+  during the maintenance.
+
+  Arguments :
+    <backend>/<server> Both the backend and the server may be specified either
+                       by their name or by their numeric ID, prefixed with a
+                       dash ('#').
+
+    <state>            2 values supported :
+                       on  = enter the maintenance mode.
+                       off = leave the maintenance mode.
+
+  This command is restricted and can only be issued on sockets configured for
+  level "admin".
+
 set timeout cli <delay>
   Change the CLI interface timeout for current connection. This can be useful
   during long debugging sessions where the user needs to constantly inspect
Seulement dans haproxy-ss-20100129-maintain: haproxy.sock
diff -Naur haproxy-ss-20100129/include/proto/checks.h haproxy-ss-20100129-maintain/include/proto/checks.h
--- haproxy-ss-20100129/include/proto/checks.h	2010-01-28 20:35:13.000000000 +0100
+++ haproxy-ss-20100129-maintain/include/proto/checks.h	2010-01-31 16:05:36.000000000 +0100
@@ -27,6 +27,8 @@
 
 const char *get_check_status_description(short check_status);
 const char *get_check_status_info(short check_status);
+void set_server_down(struct server *s);
+void set_server_up(struct server *s);
 struct task *process_chk(struct task *t);
 int start_checks();
 void health_adjust(struct server *s, short status);
diff -Naur haproxy-ss-20100129/include/types/server.h haproxy-ss-20100129-maintain/include/types/server.h
--- haproxy-ss-20100129/include/types/server.h	2010-01-28 20:35:13.000000000 +0100
+++ haproxy-ss-20100129-maintain/include/types/server.h	2010-01-31 19:15:33.000000000 +0100
@@ -47,7 +47,7 @@
 #define SRV_CHECKED	0x0010	/* this server needs to be checked */
 #define SRV_GOINGDOWN	0x0020	/* this server says that it's going down (404) */
 #define SRV_WARMINGUP	0x0040	/* this server is warming up after a failure */
-/* unused: 0x0080 */
+#define SRV_MAINTAIN	0x0080	/* this server is in maintenance mode */
 #define SRV_TPROXY_ADDR	0x0100	/* bind to this non-local address to reach this server */
 #define SRV_TPROXY_CIP	0x0200	/* bind to the client's IP address to reach this server */
 #define SRV_TPROXY_CLI	0x0300	/* bind to the client's IP+port to reach this server */
diff -Naur haproxy-ss-20100129/src/checks.c haproxy-ss-20100129-maintain/src/checks.c
--- haproxy-ss-20100129/src/checks.c	2010-01-28 20:35:13.000000000 +0100
+++ haproxy-ss-20100129-maintain/src/checks.c	2010-01-31 19:23:50.000000000 +0100
@@ -359,12 +359,16 @@
  * possible to other servers. It automatically recomputes the number of
  * servers, but not the map.
  */
-static void set_server_down(struct server *s)
+void set_server_down(struct server *s)
 {
 	struct server *srv;
 	struct chunk msg;
 	int xferred;
 
+	if (s->state & SRV_MAINTAIN) {
+		s->health = s->rise;
+	}
+
 	if (s->health == s->rise || s->tracked) {
 		int srv_was_paused = s->state & SRV_GOINGDOWN;
 
@@ -380,14 +384,19 @@
 
 		chunk_init(&msg, trash, sizeof(trash));
 
-		chunk_printf(&msg,
-			"%sServer %s/%s is DOWN", s->state & SRV_BACKUP ? "Backup " : "",
-			s->proxy->id, s->id);
-
-		server_status_printf(&msg, s,
-					((!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS))?SSP_O_HCHK:0),
-					xferred);
-
+		if (s->state & SRV_MAINTAIN) {
+			chunk_printf(&msg,
+				"%sServer %s/%s is DOWN for maintenance", s->state & SRV_BACKUP ? "Backup " : "",
+				s->proxy->id, s->id);
+		} else {
+			chunk_printf(&msg,
+				"%sServer %s/%s is DOWN", s->state & SRV_BACKUP ? "Backup " : "",
+				s->proxy->id, s->id);
+
+			server_status_printf(&msg, s,
+						((!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS))?SSP_O_HCHK:0),
+						xferred);
+		}
 		Warning("%s.\n", trash);
 
 		/* we don't send an alert if the server was previously paused */
@@ -403,18 +412,30 @@
 
 		if (s->state & SRV_CHECKED)
 			for(srv = s->tracknext; srv; srv = srv->tracknext)
-				set_server_down(srv);
+				if (! (srv->state & SRV_MAINTAIN))
+					/* Only notify tracking servers that are not already in maintenance. */
+					if (s->state & SRV_MAINTAIN)
+						/* In case the tracked server goes into maintenance,
+						 * tracking servers go UP as no more checks will be performed.
+						 */
+						set_server_up(srv);
+					else
+						set_server_down(srv);
 	}
 
 	s->health = 0; /* failure */
 }
 
-static void set_server_up(struct server *s) {
-
+void set_server_up(struct server *s) {
+	
 	struct server *srv;
 	struct chunk msg;
 	int xferred;
 
+	if (s->state & SRV_MAINTAIN) {
+		s->health = s->rise;
+	}
+
 	if (s->health == s->rise || s->tracked) {
 		if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
 			if (s->proxy->last_change < now.tv_sec)		// ignore negative times
@@ -448,20 +469,30 @@
 
 		chunk_init(&msg, trash, sizeof(trash));
 
-		chunk_printf(&msg,
-			"%sServer %s/%s is UP", s->state & SRV_BACKUP ? "Backup " : "",
-			s->proxy->id, s->id);
-
-		server_status_printf(&msg, s,
-					((!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS))?SSP_O_HCHK:0),
-					xferred);
+		if (s->state & SRV_MAINTAIN) {
+			chunk_printf(&msg,
+				"%sServer %s/%s is UP (leaving maintenance)", s->state & SRV_BACKUP ? "Backup " : "",
+				s->proxy->id, s->id);
+		} else {
+			chunk_printf(&msg,
+				"%sServer %s/%s is UP", s->state & SRV_BACKUP ? "Backup " : "",
+				s->proxy->id, s->id);
+
+			server_status_printf(&msg, s,
+						((!s->tracked && !(s->proxy->options2 & PR_O2_LOGHCHKS))?SSP_O_HCHK:0),
+						xferred);
+		}
 
 		Warning("%s.\n", trash);
 		send_log(s->proxy, LOG_NOTICE, "%s.\n", trash);
 
 		if (s->state & SRV_CHECKED)
 			for(srv = s->tracknext; srv; srv = srv->tracknext)
-				set_server_up(srv);
+				if (! (s->state & SRV_MAINTAIN))
+					/* Only notify tracking servers if we're not leaving maintenance. */
+					set_server_up(srv);
+
+		s->state &= ~SRV_MAINTAIN;
 	}
 
 	if (s->health >= s->rise)
@@ -1007,7 +1038,7 @@
 		/* we don't send any health-checks when the proxy is stopped or when
 		 * the server should not be checked.
 		 */
-		if (!(s->state & SRV_CHECKED) || s->proxy->state == PR_STSTOPPED) {
+		if (!(s->state & SRV_CHECKED) || s->proxy->state == PR_STSTOPPED || (s->state & SRV_MAINTAIN)) {
 			while (tick_is_expired(t->expire, now_ms))
 				t->expire = tick_add(t->expire, MS_TO_TICKS(s->inter));
 			return t;
diff -Naur haproxy-ss-20100129/src/dumpstats.c haproxy-ss-20100129-maintain/src/dumpstats.c
--- haproxy-ss-20100129/src/dumpstats.c	2010-01-28 20:35:13.000000000 +0100
+++ haproxy-ss-20100129-maintain/src/dumpstats.c	2010-01-31 19:48:50.000000000 +0100
@@ -498,6 +498,68 @@
 
 			return 1;
 		}
+		else if (strcmp(args[1], "maintenance") == 0) {
+			struct proxy *px;
+			struct server *sv;
+
+			if (s->listener->perm.ux.level < ACCESS_LVL_ADMIN) {
+				s->data_ctx.cli.msg = stats_permission_denied_msg;
+				si->st0 = STAT_CLI_PRINT;
+				return 1;
+			}
+
+			/* split "backend/server" and make <line> point to server */
+			for (line = args[2]; *line; line++)
+				if (*line == '/') {
+					*line++ = '\0';
+					break;
+				}
+
+			if (!*line || !*args[3]) {
+				s->data_ctx.cli.msg = "Require 'backend/server' and 'state'.\n";
+				si->st0 = STAT_CLI_PRINT;
+				return 1;
+			}
+
+			if (!get_backend_server(args[2], line, &px, &sv)) {
+				s->data_ctx.cli.msg = px ? "No such server.\n" : "No such backend.\n";
+				si->st0 = STAT_CLI_PRINT;
+				return 1;
+			}
+
+			if (!strcasecmp(args[3], "on")) {
+			    if (! (sv->state & SRV_MAINTAIN)) {
+				/* Not already in maintenance, we can change the server state */
+				sv->state |= SRV_MAINTAIN;
+				set_server_down(sv);
+			    }
+			}
+			else if (!strcasecmp(args[3], "off")) {
+			    if (sv->state & SRV_MAINTAIN) {
+				/* The server is really in maintenance, we can change the server state */
+				if (sv->tracked) {
+				    /* If this server tracks the status of another one,
+				     * we must restore the good status.
+				     */
+				    if (sv->tracked->state & SRV_RUNNING) {
+					set_server_up(sv);
+				    } else {
+					sv->state &= ~SRV_MAINTAIN;
+					set_server_down(sv);
+				    }
+				} else {
+				    set_server_up(sv);
+				}
+			    }
+			}
+			else {
+				s->data_ctx.cli.msg = "Maintenance state can only by 'on' or 'off'.\n";
+				si->st0 = STAT_CLI_PRINT;
+				return 1;
+			}
+
+			return 1;
+		}
 		else if (strcmp(args[1], "timeout") == 0) {
 			if (strcmp(args[2], "cli") == 0) {
 				unsigned timeout;
@@ -1015,6 +1077,7 @@
 			     ".backup4	{background: #c060ff;}\n"  /* NOLB state shows same as going down */
 			     ".backup5	{background: #90b0e0;}\n"  /* NOLB state shows same as going down */
 			     ".backup6	{background: #e0e0e0;}\n"
+			     ".maintain	{background: #ff6060;font-style: italic;}\n"
 			     ".rls      {letter-spacing: 0.2em; margin-right: 1px;}\n" /* right letter spacing (used for grouping digits) */
 			     "\n"
 			     "a.px:link {color: #ffff40; text-decoration: none;}"
@@ -1083,6 +1146,8 @@
 			     "</tr><tr>\n"
 			     "<td class=\"active0\"></td><td class=\"noborder\">active or backup DOWN &nbsp;</td>"
 			     "<td class=\"active6\"></td><td class=\"noborder\">not checked </td>"
+			     "</tr><tr>\n"
+			     "<td class=\"maintain\"></td><td class=\"noborder\" colspan=\"3\">active or backup DOWN for maintenance (MAINT) &nbsp;</td>"
 			     "</tr></table>\n"
 			     "Note: UP with load-balancing disabled is reported as \"NOLB\"."
 			     "</td>"
@@ -1574,7 +1639,7 @@
 					continue;
 			}
 
-			if (sv->tracked)
+			if (sv->tracked && ! (sv->tracked->state & SRV_MAINTAIN))
 				svs = sv->tracked;
 			else
 				svs = sv;
@@ -1608,10 +1673,18 @@
 							       "UP %d/%d &darr;", "UP",
 							       "NOLB %d/%d &darr;", "NOLB",
 							       "<i>no check</i>" };
-				chunk_printf(&msg,
-				     /* name */
-				     "<tr class=\"%s%d\"><td class=ac",
-				     (sv->state & SRV_BACKUP) ? "backup" : "active", sv_state);
+				if (sv->state & SRV_MAINTAIN) {
+					chunk_printf(&msg,
+					    /* name */
+					    "<tr class=\"maintain\"><td class=ac"
+					);
+				}
+				else {
+					chunk_printf(&msg,
+					    /* name */
+					    "<tr class=\"%s%d\"><td class=ac",
+					    (sv->state & SRV_BACKUP) ? "backup" : "active", sv_state);
+				}
 
 				if (uri->flags&ST_SHLGNDS) {
 					char str[INET6_ADDRSTRLEN];
@@ -1693,7 +1766,12 @@
 				/* status, lest check */
 				chunk_printf(&msg, "<td class=ac>");
 
-				if (svs->state & SRV_CHECKED) {
+				if (sv->state & SRV_MAINTAIN) {
+					chunk_printf(&msg, "%s ",
+						human_time(now.tv_sec - sv->last_change, 1));
+					chunk_printf(&msg, "MAINT");
+				}
+				else if (svs->state & SRV_CHECKED) {
 					chunk_printf(&msg, "%s ",
 						human_time(now.tv_sec - sv->last_change, 1));
 
@@ -1751,10 +1829,10 @@
 					     "<td>%lld</td><td>%s</td>"
 					     "",
 					     svs->counters.down_trans, human_time(srv_downtime(sv), 1));
-				} else if (sv != svs)
+				} else if ((sv != svs) || (sv == svs && sv->tracked))
 					chunk_printf(&msg,
 					     "<td class=ac colspan=3><a class=lfsb href=\"#%s/%s\">via %s/%s<a></td>",
-							svs->proxy->id, svs->id, svs->proxy->id, svs->id);
+							sv->tracked->proxy->id, sv->tracked->id, sv->tracked->proxy->id, sv->tracked->id);
 				else
 					chunk_printf(&msg,
 					     "<td colspan=3></td>");
@@ -1801,10 +1879,14 @@
 				     sv->counters.retries, sv->counters.redispatches);
 
 				/* status */
-				chunk_printf(&msg,
-				     srv_hlt_st[sv_state],
-				     (sv->state & SRV_RUNNING) ? (sv->health - sv->rise + 1) : (sv->health),
-				     (sv->state & SRV_RUNNING) ? (sv->fall) : (sv->rise));
+				if (sv->state & SRV_MAINTAIN) {
+					chunk_printf(&msg, "MAINT,");
+				} else {
+					chunk_printf(&msg,
+					    srv_hlt_st[sv_state],
+					    (sv->state & SRV_RUNNING) ? (sv->health - sv->rise + 1) : (sv->health),
+					    (sv->state & SRV_RUNNING) ? (sv->fall) : (sv->rise));
+				}
 
 				chunk_printf(&msg,
 				     /* weight, active, backup */

[PATCH] add a maintenance mode to servers

Reply via email to