Add an external check which makes use of an external process to
check the status of a server.
---
v4
* Remove stray use of s->check in process_chk()
The check parameter should be used throughout process_chk()
* Layer 7 timeouts of agent checks should be ignored
* Ensure that argc is never used uninitialised in prepare_external_check()
v3
* Rebase: basically a rewrite of large sections of the code
* Merge with the following patches
+ "external-check: Actually execute command"
+ "Allow selection of of external-check in configuration file"
v2
* If the external command exits normally (WIFEXITED() is true)
then set the check's code to the exit status (WEXITSTATUS())
of the process.
* Treat a timeout as a failure case rather than the test having passed
* Remove duplicate getnameinfo() call in start_checks()
* Remove duplicate assignment of sockaddr argument to getnameinfo()
which caused the check port and check addr configuration of
a server to be ignored.
---
doc/configuration.txt | 24 ++++
include/types/checks.h | 4 +
include/types/proxy.h | 1 +
include/types/server.h | 26 +++-
src/cfgparse.c | 21 +++
src/checks.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++--
6 files changed, 409 insertions(+), 18 deletions(-)
diff --git a/doc/configuration.txt b/doc/configuration.txt
index 3e58746..dcb27ba 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -3995,6 +3995,30 @@ option ldap-check
See also : "option httpchk"
+option external-check
+ Use external processes for server health checks
+ May be used in sections : defaults | frontend | listen | backend
+ yes | no | yes | yes
+ Arguments : external command to run
+
+ It is possible to test the health of a server using an external command.
+ This is achieved by running the command given as the argument to the
+ external-check option. The arguments to the command are:
+
+ proxy_address server_address check_address check_port
+
+ If the (health check) addr parameter has been provided as a parameter to
+ the server directive then it is used as the check_address, otherwise the
+ address of the server is used.
+
+ If the command is executed and exits with a zero status then the check is
+ considered to have passed, otherwise the check is considered to have
+ failed.
+
+ Example :
+ option external-check /bin/true
+
+
option log-health-checks
no option log-health-checks
Enable or disable logging of health checks
diff --git a/include/types/checks.h b/include/types/checks.h
index 09a4eee..731db37 100644
--- a/include/types/checks.h
+++ b/include/types/checks.h
@@ -44,6 +44,10 @@ enum {
HCHK_STATUS_L7OKCD, /* L7 check conditionally passed */
HCHK_STATUS_L7STS, /* L7 response error, for example HTTP
5xx */
+ HCHK_STATUS_PROCERR, /* External process check failure */
+ HCHK_STATUS_PROCTOUT, /* External process check timeout */
+ HCHK_STATUS_PROCOK, /* External process check passed */
+
HCHK_STATUS_SIZE
};
diff --git a/include/types/proxy.h b/include/types/proxy.h
index 66e5db7..e608b9f 100644
--- a/include/types/proxy.h
+++ b/include/types/proxy.h
@@ -152,6 +152,7 @@ enum {
#define PR_O2_LDAP_CHK 0x60000000 /* use LDAP check for server health */
#define PR_O2_SSL3_CHK 0x70000000 /* use SSLv3 CLIENT_HELLO packets for
server health */
#define PR_O2_LB_AGENT_CHK 0x80000000 /* use a TCP connection to obtain a
metric of server health */
+#define PR_O2_EXT_CHK 0x90000000 /* use external command for server
health */
/* unused: 0x90000000 to 0xF000000, reserved for health checks */
#define PR_O2_CHK_ANY 0xF0000000 /* Mask to cover any check */
/* end of proxy->options2 */
diff --git a/include/types/server.h b/include/types/server.h
index cf80c7f..d55615b 100644
--- a/include/types/server.h
+++ b/include/types/server.h
@@ -24,6 +24,7 @@
#include <netinet/in.h>
#include <arpa/inet.h>
+#include <stdbool.h>
#ifdef USE_OPENSSL
#include <openssl/ssl.h>
@@ -99,6 +100,14 @@
#define SRV_SSL_O_NO_TLS_TICKETS 0x0100 /* disable session resumption tickets
*/
#endif
+struct pid_list {
+ struct list list;
+ pid_t pid;
+ struct task *t;
+ int status;
+ bool exited;
+};
+
/* A tree occurrence is a descriptor of a place in a tree, with a pointer back
* to the server itself.
*/
@@ -109,17 +118,24 @@ struct tree_occ {
};
struct check {
- struct connection *conn; /* connection state for health
checks */
-
+ union {
+ struct {
+ struct connection *conn;/* connection state for health
checks */
+ struct buffer *bi, *bo; /* input and output buffers to
send/recv check */
+ int use_ssl; /* use SSL for health checks */
+ int send_proxy; /* send a PROXY protocol header
with checks */
+ };
+ struct {
+ char **argv; /* the arguments to use if
running a process-based check */
+ struct pid_list *curpid;/* entry in pid_list used for
current process-based test, or NULL if not in test */
+ };
+ };
short port; /* the port to use for the
health checks */
- struct buffer *bi, *bo; /* input and output buffers to
send/recv check */
struct task *task; /* the task associated to the
health check processing, NULL if disabled */
struct timeval start; /* last health check start time
*/
long duration; /* time in ms took to finish
last health check */
short status, code; /* check result, check code */
char desc[HCHK_DESC_LEN]; /* health check descritpion */
- int use_ssl; /* use SSL for health checks */
- int send_proxy; /* send a PROXY protocol header
with checks */
int inter, fastinter, downinter; /* checks: time in milliseconds
*/
int result; /* health-check result :
SRV_CHK_* */
int state; /* health-check result : CHK_*
*/
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 995d9ff..2a9e10d 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -1628,6 +1628,11 @@ out:
static int init_check(struct server *s, struct check *check, const char *
file, int linenum)
{
+ if (check->type == PR_O2_EXT_CHK) {
+ /* Nothing left to do for external checks */
+ return 0;
+ }
+
/* Allocate buffer for requests... */
if ((check->bi = calloc(sizeof(struct buffer) + global.tune.chksize,
sizeof(char))) == NULL) {
Alert("parsing [%s:%d] : out of memory while allocating check
buffer.\n", file, linenum);
@@ -3737,6 +3742,22 @@ stats_error_parsing:
memcpy(curproxy->check_req, DEF_LDAP_CHECK_REQ,
sizeof(DEF_LDAP_CHECK_REQ) - 1);
curproxy->check_len = sizeof(DEF_LDAP_CHECK_REQ) - 1;
}
+ else if (!strcmp(args[1], "external-check")) {
+ /* execute an external command to check servers' health
*/
+ free(curproxy->check_req);
+ curproxy->check_req = NULL;
+ curproxy->options2 &= ~PR_O2_CHK_ANY;
+ curproxy->options2 |= PR_O2_EXT_CHK;
+
+ if (!*(args[2])) {
+ Alert("parsing [%s:%d] : '%s' expects command
as argument.\n",
+ file, linenum, args[0]);
+ err_code |= ERR_ALERT | ERR_FATAL;
+ goto out;
+ }
+ curproxy->check_req = strdup(args[2]);
+ curproxy->check_len = strlen(curproxy->check_req);
+ }
else if (!strcmp(args[1], "forwardfor")) {
int cur_arg;
diff --git a/src/checks.c b/src/checks.c
index c61a5e9..bf63286 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -22,9 +22,11 @@
#include <unistd.h>
#include <sys/socket.h>
#include <sys/types.h>
+#include <sys/wait.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
+#include <netdb.h>
#include <common/chunk.h>
#include <common/compat.h>
@@ -78,6 +80,10 @@ static const struct check_status
check_statuses[HCHK_STATUS_SIZE] = {
[HCHK_STATUS_L7OKD] = { SRV_CHK_PASSED, "L7OK",
"Layer7 check passed" },
[HCHK_STATUS_L7OKCD] = { SRV_CHK_PASSED | SRV_CHK_DISABLE, "L7OKC",
"Layer7 check conditionally passed" },
[HCHK_STATUS_L7STS] = { SRV_CHK_FAILED, "L7STS",
"Layer7 wrong status" },
+
+ [HCHK_STATUS_PROCERR] = { SRV_CHK_FAILED,
"PROCERR", "External check error" },
+ [HCHK_STATUS_PROCTOUT] = { SRV_CHK_FAILED,
"PROCTOUT", "External check timeout" },
+ [HCHK_STATUS_PROCOK] = { SRV_CHK_PASSED,
"PROCOK", "External check passed" },
};
static const struct analyze_status analyze_statuses[HANA_STATUS_SIZE] = {
/* 0: ignore, 1: error, 2: OK */
@@ -231,7 +237,7 @@ static void set_server_check_status(struct check *check,
short status, const cha
/* Failure to connect to the agent as a secondary check should not
* cause the server to be marked down. So only log status changes
* for HCHK_STATUS_* statuses */
- if (check == &s->agent && check->status < HCHK_STATUS_L7TOUT)
+ if (check == &s->agent && check->status <= HCHK_STATUS_L7TOUT)
return;
if (s->proxy->options2 & PR_O2_LOGHCHKS &&
@@ -1385,7 +1391,7 @@ static void process_result(struct check *check)
}
/*
- * establish a server health-check.
+ * establish a server health-check that makes use of a connection.
*
* It can return one of :
* - SN_ERR_NONE if everything's OK
@@ -1396,7 +1402,7 @@ static void process_result(struct check *check)
* - SN_ERR_INTERNAL for any other purely internal errors
* Additionnally, in the case of SN_ERR_RESOURCE, an emergency log will be
emitted.
*/
-static int establish_chk(struct task *t)
+static int establish_conn_chk(struct task *t)
{
struct check *check = t->context;
struct server *s = check->server;
@@ -1453,6 +1459,293 @@ static int establish_chk(struct task *t)
return ret;
}
+static struct list pid_list = LIST_HEAD_INIT(pid_list);
+static struct pool_head *pool2_pid_list;
+
+void block_sigchld(void)
+{
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, SIGCHLD);
+ assert(sigprocmask(SIG_SETMASK, &set, NULL) == 0);
+}
+
+void unblock_sigchld(void)
+{
+ sigset_t set;
+ sigemptyset(&set);
+ assert(sigprocmask(SIG_SETMASK, &set, NULL) == 0);
+}
+
+/* Call with SIGCHLD blocked */
+static struct pid_list *pid_list_add(pid_t pid, struct task *t)
+{
+ struct pid_list *elem;
+ struct check *check = t->context;
+
+ elem = pool_alloc2(pool2_pid_list);
+ if (!elem)
+ return NULL;
+ elem->pid = pid;
+ elem->t = t;
+ elem->exited = 0;
+ check->curpid = elem;
+ LIST_INIT(&elem->list);
+ LIST_ADD(&pid_list, &elem->list);
+ return elem;
+}
+
+/* Blocks and then unblocks SIGCHLD */
+static void pid_list_del(struct pid_list *elem)
+{
+ struct check *check;
+
+ if (!elem)
+ return;
+
+ block_sigchld();
+ LIST_DEL(&elem->list);
+ unblock_sigchld();
+ if (!elem->exited)
+ kill(elem->pid, SIGTERM);
+
+ check = elem->t->context;
+ check->curpid = NULL;
+ pool_free2(pool2_pid_list, elem);
+}
+
+/* Called from inside SIGCHLD handler, SIGCHLD is blocked */
+static void pid_list_expire(pid_t pid, int status)
+{
+ struct pid_list *elem;
+
+ list_for_each_entry(elem, &pid_list, list) {
+ if (elem->pid == pid) {
+ elem->t->expire = now_ms;
+ elem->status = status;
+ elem->exited = 1;
+ return;
+ }
+ }
+}
+
+static void sigchld_handler(int signal)
+{
+ pid_t pid;
+ int status;
+ while ((pid = waitpid(0, &status, WNOHANG)) > 0)
+ pid_list_expire(pid, status);
+}
+
+static int init_pid_list(void) {
+ struct sigaction action = {
+ .sa_handler = sigchld_handler,
+ .sa_flags = SA_NOCLDSTOP
+ };
+
+ if (pool2_pid_list != NULL)
+ /* Nothing to do */
+ return 0;
+
+ if (sigaction(SIGCHLD, &action, NULL)) {
+ Alert("Failed to set signal handler for external health checks:
%s. Aborting.\n",
+ strerror(errno));
+ return 1;
+ }
+
+ pool2_pid_list = create_pool("pid_list", sizeof(struct pid_list),
MEM_F_SHARED);
+ if (pool2_pid_list == NULL) {
+ Alert("Failed to allocate memory pool for external health
checks: %s. Aborting.\n",
+ strerror(errno));
+ return 1;
+ }
+
+ return 0;
+}
+
+
+static int prepare_external_check(struct check *check)
+{
+ char host[NI_MAXHOST] = {};
+ char serv[NI_MAXSERV] = {};
+ struct sockaddr_storage sa;
+ struct server *s = check->server;
+ struct proxy *px = s->proxy;
+ struct listener *listener = NULL, *l;
+ int i, argc;
+ const char *err_fmt = "Starting [%s:%s] check: out of memory.\n";
+
+ list_for_each_entry(l, &px->conf.listeners, by_fe)
+ /* Use the first INET, INET6 or UNIX listener */
+ if (l->addr.ss_family == AF_INET ||
+ l->addr.ss_family == AF_INET6 ||
+ l->addr.ss_family == AF_UNIX) {
+ listener = l;
+ break;
+ }
+
+ if (!listener) {
+ err_fmt = "Starting [%s:%s] check: no listener.\n";
+ goto err;
+ }
+
+ check->curpid = NULL;
+
+ check->argv = calloc(6, sizeof(check->argv));
+ if (!check->argv)
+ goto err;
+
+ check->argv[0] = px->check_req;
+
+ if (is_addr(&s->check_common.addr))
+ /* we'll connect to the check addr specified on the server */
+ sa = s->check_common.addr;
+ else
+ /* we'll connect to the addr on the server */
+ sa = s->addr;
+ set_host_port(&sa, check->port);
+
+ if (getnameinfo((struct sockaddr *)&sa, sizeof(sa),
+ host, sizeof(host), serv, sizeof(serv),
+ NI_NUMERICHOST | NI_NUMERICSERV)) {
+ err_fmt = "Starting [%s:%s] check: getnameinfo failed on check
address.\n";
+ abort();
+ goto err_free;
+ }
+
+ check->argv[1] = strdup(host);
+ check->argv[2] = strdup(serv);
+
+ if (listener->addr.ss_family == AF_INET ||
+ listener->addr.ss_family == AF_INET6) {
+ if (getnameinfo((struct sockaddr *)&listener->addr,
+ sizeof(listener->addr), host, sizeof(host),
+ serv, sizeof(serv),
+ NI_NUMERICHOST | NI_NUMERICSERV)) {
+ err_fmt = "Starting [%s:%s] check: getnameinfo failed
on listener address.\n";
+ goto err_free;
+ }
+ check->argv[3] = strdup(host);
+ check->argv[4] = strdup(serv);
+ argc = 5;
+ } else if (listener->addr.ss_family == AF_UNIX) {
+ const struct sockaddr_un *un;
+
+ un = (struct sockaddr_un *)&listener->addr;
+ check->argv[3] = strdup(un->sun_path);
+ argc = 4;
+ } else {
+ goto err;
+ }
+
+ for (i = 0; i < argc; i++)
+ if (!check->argv[i])
+ goto err_free;
+
+ return 0;
+err_free:
+ for (i = 1; i < 5; i++)
+ free(check->argv[i]);
+ free(check->argv);
+ check->argv = NULL;
+err:
+ Alert(err_fmt, px->id, s->id);
+ return -1;
+}
+
+/*
+ * establish a server health-check that makes use of a process.
+ *
+ * It can return one of :
+ * - SN_ERR_NONE if everything's OK
+ * - SN_ERR_SRVTO if there are no more servers
+ * - SN_ERR_SRVCL if the connection was refused by the server
+ * - SN_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+ * - SN_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports,
...)
+ * - SN_ERR_INTERNAL for any other purely internal errors
+ * Additionally, in the case of SN_ERR_RESOURCE, an emergency log will be
emitted.
+ *
+ * Blocks and then unblocks SIGCHLD
+ */
+static int establish_proc_chk(struct task *t)
+{
+ struct check *check = t->context;
+ struct server *s = check->server;
+ struct proxy *px = s->proxy;
+ int status;
+ pid_t pid;
+
+ if (!check->argv) {
+ status = prepare_external_check(check);
+ if (status < 0)
+ return SN_ERR_RESOURCE;
+ }
+
+ status = SN_ERR_RESOURCE;
+
+ block_sigchld();
+
+ pid = fork();
+ if (pid < 0) {
+ Alert("Failed to fork process for external health check: %s.
Aborting.\n",
+ strerror(errno));
+ set_server_check_status(check, HCHK_STATUS_SOCKERR,
strerror(errno));
+ goto out;
+ }
+ if (pid == 0) {
+ /* Child */
+ execvp(px->check_req, check->argv);
+ Alert("Failed to exec process for external health check: %s.
Aborting.\n",
+ strerror(errno));
+ exit(-1);
+ }
+
+ /* Parent */
+ if (check->result == SRV_CHK_UNKNOWN) {
+ if (pid_list_add(pid, t) != NULL) {
+ t->expire = tick_add(now_ms, MS_TO_TICKS(check->inter));
+
+ if (px->timeout.check && px->timeout.connect) {
+ int t_con = tick_add(now_ms,
px->timeout.connect);
+ t->expire = tick_first(t->expire, t_con);
+ }
+ status = SN_ERR_NONE;
+ goto out;
+ }
+ else {
+ set_server_check_status(check, HCHK_STATUS_SOCKERR,
strerror(errno));
+ }
+ kill(pid, SIGTERM); /* process creation error */
+ }
+ else
+ set_server_check_status(check, HCHK_STATUS_SOCKERR,
strerror(errno));
+
+out:
+ unblock_sigchld();
+ return status;
+}
+
+/*
+ * establish a server health-check.
+ *
+ * It can return one of :
+ * - SN_ERR_NONE if everything's OK
+ * - SN_ERR_SRVTO if there are no more servers
+ * - SN_ERR_SRVCL if the connection was refused by the server
+ * - SN_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
+ * - SN_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports,
...)
+ * - SN_ERR_INTERNAL for any other purely internal errors
+ * Additionally, in the case of SN_ERR_RESOURCE, an emergency log will be
emitted.
+ */
+static int establish_chk(struct task *t)
+{
+ struct check *check = t->context;
+
+ if (check->type == PR_O2_EXT_CHK)
+ return establish_proc_chk(t);
+ return establish_conn_chk(t);
+}
+
/*
* manages a server health-check. Returns
* the time the task accepts to wait, or TIME_ETERNITY for infinity.
@@ -1465,6 +1758,7 @@ static struct task *process_chk(struct task *t)
int rv;
int ret;
int expired = tick_is_expired(t->expire, now_ms);
+ int has_conn = check->type != PR_O2_EXT_CHK;
if (!(check->state & CHK_RUNNING)) {
/* no check currently running */
@@ -1485,10 +1779,13 @@ static struct task *process_chk(struct task *t)
set_server_check_status(check, HCHK_STATUS_START, NULL);
check->state |= CHK_RUNNING;
- check->bi->p = check->bi->data;
- check->bi->i = 0;
- check->bo->p = check->bo->data;
- check->bo->o = 0;
+
+ if (has_conn) {
+ check->bi->p = check->bi->data;
+ check->bi->i = 0;
+ check->bo->p = check->bo->data;
+ check->bo->o = 0;
+ }
ret = establish_chk(t);
switch (ret) {
@@ -1503,7 +1800,9 @@ static struct task *process_chk(struct task *t)
int t_con = tick_add(now_ms,
s->proxy->timeout.connect);
t->expire = tick_first(t->expire, t_con);
}
- conn_data_poll_recv(conn); /* prepare for reading a
possible reply */
+ if (has_conn) {
+ conn_data_poll_recv(conn); /* prepare for
reading a possible reply */
+ }
goto reschedule;
case SN_ERR_SRVTO: /* ETIMEDOUT */
@@ -1541,15 +1840,15 @@ static struct task *process_chk(struct task *t)
* First, let's check whether there was an uncaught error,
* which can happen on connect timeout or error.
*/
- if (s->check.result == SRV_CHK_UNKNOWN) {
- if ((conn->flags &
(CO_FL_CONNECTED|CO_FL_WAIT_L4_CONN)) == CO_FL_WAIT_L4_CONN) {
+ if (check->result == SRV_CHK_UNKNOWN) {
+ if (has_conn && (conn->flags &
(CO_FL_CONNECTED|CO_FL_WAIT_L4_CONN)) == CO_FL_WAIT_L4_CONN) {
/* L4 not established (yet) */
if (conn->flags & CO_FL_ERROR)
set_server_check_status(check,
HCHK_STATUS_L4CON, NULL);
else if (expired)
set_server_check_status(check,
HCHK_STATUS_L4TOUT, NULL);
}
- else if ((conn->flags &
(CO_FL_CONNECTED|CO_FL_WAIT_L6_CONN)) == CO_FL_WAIT_L6_CONN) {
+ else if (has_conn && (conn->flags &
(CO_FL_CONNECTED|CO_FL_WAIT_L6_CONN)) == CO_FL_WAIT_L6_CONN) {
/* L6 not established (yet) */
if (conn->flags & CO_FL_ERROR)
set_server_check_status(check,
HCHK_STATUS_L6RSP, NULL);
@@ -1563,6 +1862,23 @@ static struct task *process_chk(struct task *t)
else
set_server_check_status(check,
HCHK_STATUS_L4OK, NULL);
}
+ else if (!has_conn) {
+ struct pid_list *elem = check->curpid;
+ int status;
+
+ if (elem->exited) {
+ status = elem->status; /* Save in case
the process exits between use below */
+ if (!WIFEXITED(status))
+ check->code = -1;
+ else
+ check->code =
WEXITSTATUS(status);
+ if (!WIFEXITED(status) ||
WEXITSTATUS(status))
+ status = HCHK_STATUS_PROCERR;
+ else
+ status = HCHK_STATUS_PROCOK;
+ set_server_check_status(check, status,
NULL);
+ }
+ }
else if (expired) {
/* connection established but expired check */
if (check->type == PR_O2_SSL3_CHK)
@@ -1576,7 +1892,7 @@ static struct task *process_chk(struct task *t)
}
/* check complete or aborted */
- if (conn->xprt) {
+ if (has_conn && conn->xprt) {
/* The check was aborted and the connection was not yet
closed.
* This can happen upon timeout, or when an external
event such
* as a failed response coupled with "observe layer7"
caused the
@@ -1600,6 +1916,9 @@ static struct task *process_chk(struct task *t)
process_result(&check->server->agent);
}
+ if (!has_conn)
+ pid_list_del(check->curpid);
+
rv = 0;
if (global.spread_checks > 0) {
rv = srv_getinter(check) * global.spread_checks / 100;
@@ -1614,7 +1933,6 @@ static struct task *process_chk(struct task *t)
out_wait:
return t;
}
-
static int start_check_task(struct check *check, int mininter,
int nbcheck, int srvpos)
{
@@ -1683,6 +2001,13 @@ int start_checks() {
* the number of servers, weighted by the server's position in the list.
*/
for (px = proxy; px; px = px->next) {
+ if ((px->options2 & PR_O2_CHK_ANY) == PR_O2_EXT_CHK) {
+ if (init_pid_list()) {
+ Alert("Starting [%s] check: out of memory.\n",
px->id);
+ return -1;
+ }
+ }
+
for (s = px->srv; s; s = s->next) {
if (s->slowstart) {
if ((t = task_new()) == NULL) {
--
1.7.10.4