Add an external check which makes use of an external process to check the status of a server.
--- v4 * Remove stray use of s->check in process_chk() The check parameter should be used throughout process_chk() * Layer 7 timeouts of agent checks should be ignored * Ensure that argc is never used uninitialised in prepare_external_check() v3 * Rebase: basically a rewrite of large sections of the code * Merge with the following patches + "external-check: Actually execute command" + "Allow selection of external-check in configuration file" v2 * If the external command exits normally (WIFEXITED() is true) then set the check's code to the exit status (WEXITSTATUS()) of the process. * Treat a timeout as a failure case rather than the test having passed * Remove duplicate getnameinfo() call in start_checks() * Remove duplicate assignment of sockaddr argument to getnameinfo() which caused the check port and check addr configuration of a server to be ignored. --- doc/configuration.txt | 24 ++++ include/types/checks.h | 4 + include/types/proxy.h | 1 + include/types/server.h | 26 +++- src/cfgparse.c | 21 +++ src/checks.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 409 insertions(+), 18 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 3e58746..dcb27ba 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -3995,6 +3995,30 @@ option ldap-check See also : "option httpchk" +option external-check + Use external processes for server health checks + May be used in sections : defaults | frontend | listen | backend + yes | no | yes | yes + Arguments : external command to run + + It is possible to test the health of a server using an external command. + This is achieved by running the command given as the argument to the + external-check option. 
The arguments to the command are: + + check_address check_port proxy_address proxy_port + + If the (health check) addr parameter has been provided as a parameter to + the server directive then it is used as the check_address, otherwise the + address of the server is used. + + If the command executes and exits with a zero status then the check is + considered to have passed, otherwise the check is considered to have + failed. + + Example : + option external-check /bin/true + + option log-health-checks no option log-health-checks Enable or disable logging of health checks diff --git a/include/types/checks.h b/include/types/checks.h index 09a4eee..731db37 100644 --- a/include/types/checks.h +++ b/include/types/checks.h @@ -44,6 +44,10 @@ enum { HCHK_STATUS_L7OKCD, /* L7 check conditionally passed */ HCHK_STATUS_L7STS, /* L7 response error, for example HTTP 5xx */ + HCHK_STATUS_PROCERR, /* External process check failure */ + HCHK_STATUS_PROCTOUT, /* External process check timeout */ + HCHK_STATUS_PROCOK, /* External process check passed */ + HCHK_STATUS_SIZE }; diff --git a/include/types/proxy.h b/include/types/proxy.h index 66e5db7..e608b9f 100644 --- a/include/types/proxy.h +++ b/include/types/proxy.h @@ -152,6 +152,7 @@ enum { #define PR_O2_LDAP_CHK 0x60000000 /* use LDAP check for server health */ #define PR_O2_SSL3_CHK 0x70000000 /* use SSLv3 CLIENT_HELLO packets for server health */ #define PR_O2_LB_AGENT_CHK 0x80000000 /* use a TCP connection to obtain a metric of server health */ +#define PR_O2_EXT_CHK 0x90000000 /* use external command for server health */ /* unused: 0x90000000 to 0xF000000, reserved for health checks */ #define PR_O2_CHK_ANY 0xF0000000 /* Mask to cover any check */ /* end of proxy->options2 */ diff --git a/include/types/server.h b/include/types/server.h index cf80c7f..d55615b 100644 --- a/include/types/server.h +++ b/include/types/server.h @@ -24,6 +24,7 @@ #include <netinet/in.h> #include <arpa/inet.h> +#include <stdbool.h> #ifdef 
USE_OPENSSL #include <openssl/ssl.h> @@ -99,6 +100,14 @@ #define SRV_SSL_O_NO_TLS_TICKETS 0x0100 /* disable session resumption tickets */ #endif +struct pid_list { + struct list list; + pid_t pid; + struct task *t; + int status; + bool exited; +}; + /* A tree occurrence is a descriptor of a place in a tree, with a pointer back * to the server itself. */ @@ -109,17 +118,24 @@ struct tree_occ { }; struct check { - struct connection *conn; /* connection state for health checks */ - + union { + struct { + struct connection *conn;/* connection state for health checks */ + struct buffer *bi, *bo; /* input and output buffers to send/recv check */ + int use_ssl; /* use SSL for health checks */ + int send_proxy; /* send a PROXY protocol header with checks */ + }; + struct { + char **argv; /* the arguments to use if running a process-based check */ + struct pid_list *curpid;/* entry in pid_list used for current process-based test, or -1 if not in test */ + }; + }; short port; /* the port to use for the health checks */ - struct buffer *bi, *bo; /* input and output buffers to send/recv check */ struct task *task; /* the task associated to the health check processing, NULL if disabled */ struct timeval start; /* last health check start time */ long duration; /* time in ms took to finish last health check */ short status, code; /* check result, check code */ char desc[HCHK_DESC_LEN]; /* health check descritpion */ - int use_ssl; /* use SSL for health checks */ - int send_proxy; /* send a PROXY protocol header with checks */ int inter, fastinter, downinter; /* checks: time in milliseconds */ int result; /* health-check result : SRV_CHK_* */ int state; /* health-check result : CHK_* */ diff --git a/src/cfgparse.c b/src/cfgparse.c index 995d9ff..2a9e10d 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -1628,6 +1628,11 @@ out: static int init_check(struct server *s, struct check *check, const char * file, int linenum) { + if (check->type == PR_O2_EXT_CHK) { + /* Nothing left to 
do for external checks */ + return 0; + } + /* Allocate buffer for requests... */ if ((check->bi = calloc(sizeof(struct buffer) + global.tune.chksize, sizeof(char))) == NULL) { Alert("parsing [%s:%d] : out of memory while allocating check buffer.\n", file, linenum); @@ -3737,6 +3742,22 @@ stats_error_parsing: memcpy(curproxy->check_req, DEF_LDAP_CHECK_REQ, sizeof(DEF_LDAP_CHECK_REQ) - 1); curproxy->check_len = sizeof(DEF_LDAP_CHECK_REQ) - 1; } + else if (!strcmp(args[1], "external-check")) { + /* excute an external command to check servers' health */ + free(curproxy->check_req); + curproxy->check_req = NULL; + curproxy->options2 &= ~PR_O2_CHK_ANY; + curproxy->options2 |= PR_O2_EXT_CHK; + + if (!*(args[2])) { + Alert("parsing [%s:%d] : '%s' expects command as argument.\n", + file, linenum, args[0]); + err_code |= ERR_ALERT | ERR_FATAL; + goto out; + } + curproxy->check_req = strdup(args[2]); + curproxy->check_len = strlen(curproxy->check_req); + } else if (!strcmp(args[1], "forwardfor")) { int cur_arg; diff --git a/src/checks.c b/src/checks.c index c61a5e9..bf63286 100644 --- a/src/checks.c +++ b/src/checks.c @@ -22,9 +22,11 @@ #include <unistd.h> #include <sys/socket.h> #include <sys/types.h> +#include <sys/wait.h> #include <netinet/in.h> #include <netinet/tcp.h> #include <arpa/inet.h> +#include <netdb.h> #include <common/chunk.h> #include <common/compat.h> @@ -78,6 +80,10 @@ static const struct check_status check_statuses[HCHK_STATUS_SIZE] = { [HCHK_STATUS_L7OKD] = { SRV_CHK_PASSED, "L7OK", "Layer7 check passed" }, [HCHK_STATUS_L7OKCD] = { SRV_CHK_PASSED | SRV_CHK_DISABLE, "L7OKC", "Layer7 check conditionally passed" }, [HCHK_STATUS_L7STS] = { SRV_CHK_FAILED, "L7STS", "Layer7 wrong status" }, + + [HCHK_STATUS_PROCERR] = { SRV_CHK_FAILED, "PROCERR", "External check error" }, + [HCHK_STATUS_PROCTOUT] = { SRV_CHK_FAILED, "PROCTOUT", "External check timeout" }, + [HCHK_STATUS_PROCOK] = { SRV_CHK_PASSED, "PROCOK", "External check passed" }, }; static const struct 
analyze_status analyze_statuses[HANA_STATUS_SIZE] = { /* 0: ignore, 1: error, 2: OK */ @@ -231,7 +237,7 @@ static void set_server_check_status(struct check *check, short status, const cha /* Failure to connect to the agent as a secondary check should not * cause the server to be marked down. So only log status changes * for HCHK_STATUS_* statuses */ - if (check == &s->agent && check->status < HCHK_STATUS_L7TOUT) + if (check == &s->agent && check->status <= HCHK_STATUS_L7TOUT) return; if (s->proxy->options2 & PR_O2_LOGHCHKS && @@ -1385,7 +1391,7 @@ static void process_result(struct check *check) } /* - * establish a server health-check. + * establish a server health-check that makes use of a connection. * * It can return one of : * - SN_ERR_NONE if everything's OK @@ -1396,7 +1402,7 @@ static void process_result(struct check *check) * - SN_ERR_INTERNAL for any other purely internal errors * Additionnally, in the case of SN_ERR_RESOURCE, an emergency log will be emitted. */ -static int establish_chk(struct task *t) +static int establish_conn_chk(struct task *t) { struct check *check = t->context; struct server *s = check->server; @@ -1453,6 +1459,293 @@ static int establish_chk(struct task *t) return ret; } +static struct list pid_list = LIST_HEAD_INIT(pid_list); +static struct pool_head *pool2_pid_list; + +void block_sigchld(void) +{ + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGCHLD); + assert(sigprocmask(SIG_SETMASK, &set, NULL) == 0); +} + +void unblock_sigchld(void) +{ + sigset_t set; + sigemptyset(&set); + assert(sigprocmask(SIG_SETMASK, &set, NULL) == 0); +} + +/* Call with SIGCHLD blocked */ +static struct pid_list *pid_list_add(pid_t pid, struct task *t) +{ + struct pid_list *elem; + struct check *check = t->context; + + elem = pool_alloc2(pool2_pid_list); + if (!elem) + return NULL; + elem->pid = pid; + elem->t = t; + elem->exited = 0; + check->curpid = elem; + LIST_INIT(&elem->list); + LIST_ADD(&pid_list, &elem->list); + return elem; +} + +/* 
Blocks blocks and then unblocks SIGCHLD */ +static void pid_list_del(struct pid_list *elem) +{ + struct check *check; + + if (!elem) + return; + + block_sigchld(); + LIST_DEL(&elem->list); + unblock_sigchld(); + if (!elem->exited) + kill(elem->pid, SIGTERM); + + check = elem->t->context; + check->curpid = NULL; + pool_free2(pool2_pid_list, elem); +} + +/* Called from inside SIGCHLD handler, SIGCHLD is blocked */ +static void pid_list_expire(pid_t pid, int status) +{ + struct pid_list *elem; + + list_for_each_entry(elem, &pid_list, list) { + if (elem->pid == pid) { + elem->t->expire = now_ms; + elem->status = status; + elem->exited = 1; + return; + } + } +} + +static void sigchld_handler(int signal) +{ + pid_t pid; + int status; + while ((pid = waitpid(0, &status, WNOHANG)) > 0) + pid_list_expire(pid, status); +} + +static int init_pid_list(void) { + struct sigaction action = { + .sa_handler = sigchld_handler, + .sa_flags = SA_NOCLDSTOP + }; + + if (pool2_pid_list != NULL) + /* Nothing to do */ + return 0; + + if (sigaction(SIGCHLD, &action, NULL)) { + Alert("Failed to set signal handler for external health checks: %s. Aborting.\n", + strerror(errno)); + return 1; + } + + pool2_pid_list = create_pool("pid_list", sizeof(struct pid_list), MEM_F_SHARED); + if (pool2_pid_list == NULL) { + Alert("Failed to allocate memory pool for external health checks: %s. 
Aborting.\n", + strerror(errno)); + return 1; + } + + return 0; +} + + +static int prepare_external_check(struct check *check) +{ + char host[NI_MAXHOST] = {}; + char serv[NI_MAXSERV] = {}; + struct sockaddr_storage sa; + struct server *s = check->server; + struct proxy *px = s->proxy; + struct listener *listener = NULL, *l; + int i, argc; + const char *err_fmt = "Starting [%s:%s] check: out of memory.\n"; + + list_for_each_entry(l, &px->conf.listeners, by_fe) + /* Use the first INET, INET6 or UNIX listener */ + if (l->addr.ss_family == AF_INET || + l->addr.ss_family == AF_INET6 || + l->addr.ss_family == AF_UNIX) { + listener = l; + break; + } + + if (!listener) { + err_fmt = "Starting [%s:%s] check: no listener.\n"; + goto err; + } + + check->curpid = NULL; + + check->argv = calloc(6, sizeof(check->argv)); + if (!check->argv) + goto err; + + check->argv[0] = px->check_req; + + if (is_addr(&s->check_common.addr)) + /* we'll connect to the check addr specified on the server */ + sa = s->check_common.addr; + else + /* we'll connect to the addr on the server */ + sa = s->addr; + set_host_port(&sa, check->port); + + if (getnameinfo((struct sockaddr *)&sa, sizeof(sa), + host, sizeof(host), serv, sizeof(serv), + NI_NUMERICHOST | NI_NUMERICSERV)) { + err_fmt = "Starting [%s:%s] check: getnameinfo failed on check address.\n"; + abort(); + goto err_free; + } + + check->argv[1] = strdup(host); + check->argv[2] = strdup(serv); + + if (listener->addr.ss_family == AF_INET || + listener->addr.ss_family == AF_INET6) { + if (getnameinfo((struct sockaddr *)&listener->addr, + sizeof(listener->addr), host, sizeof(host), + serv, sizeof(serv), + NI_NUMERICHOST | NI_NUMERICSERV)) { + err_fmt = "Starting [%s:%s] check: getnameinfo failed on listener address.\n"; + goto err_free; + } + check->argv[3] = strdup(host); + check->argv[4] = strdup(serv); + argc = 5; + } else if (listener->addr.ss_family == AF_UNIX) { + const struct sockaddr_un *un; + + un = (struct sockaddr_un 
*)&listener->addr; + check->argv[3] = strdup(un->sun_path); + argc = 4; + } else { + goto err; + } + + for (i = 0; i < argc; i++) + if (!check->argv[i]) + goto err_free; + + return 0; +err_free: + for (i = 1; i < 5; i++) + free(check->argv[i]); + free(check->argv); + check->argv = NULL; +err: + Alert(err_fmt, px->id, s->id); + return -1; +} + +/* + * establish a server health-check that makes use of a process. + * + * It can return one of : + * - SN_ERR_NONE if everything's OK + * - SN_ERR_SRVTO if there are no more servers + * - SN_ERR_SRVCL if the connection was refused by the server + * - SN_ERR_PRXCOND if the connection has been limited by the proxy (maxconn) + * - SN_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...) + * - SN_ERR_INTERNAL for any other purely internal errors + * Additionnally, in the case of SN_ERR_RESOURCE, an emergency log will be emitted. + * + * Blocks and then unblocks SIGCHLD + */ +static int establish_proc_chk(struct task *t) +{ + struct check *check = t->context; + struct server *s = check->server; + struct proxy *px = s->proxy; + int status; + pid_t pid; + + if (!check->argv) { + status = prepare_external_check(check); + if (status < 0) + return SN_ERR_RESOURCE; + } + + status = SN_ERR_RESOURCE; + + block_sigchld(); + + pid = fork(); + if (pid < 0) { + Alert("Failed to fork process for external health check: %s. Aborting.\n", + strerror(errno)); + set_server_check_status(check, HCHK_STATUS_SOCKERR, strerror(errno)); + goto out; + } + if (pid == 0) { + /* Child */ + execvp(px->check_req, check->argv); + Alert("Failed to exec process for external health check: %s. 
Aborting.\n", + strerror(errno)); + exit(-1); + } + + /* Parent */ + if (check->result == SRV_CHK_UNKNOWN) { + if (pid_list_add(pid, t) != NULL) { + t->expire = tick_add(now_ms, MS_TO_TICKS(check->inter)); + + if (px->timeout.check && px->timeout.connect) { + int t_con = tick_add(now_ms, px->timeout.connect); + t->expire = tick_first(t->expire, t_con); + } + status = SN_ERR_NONE; + goto out; + } + else { + set_server_check_status(check, HCHK_STATUS_SOCKERR, strerror(errno)); + } + kill(pid, SIGTERM); /* process creation error */ + } + else + set_server_check_status(check, HCHK_STATUS_SOCKERR, strerror(errno)); + +out: + unblock_sigchld(); + return status; +} + +/* + * establish a server health-check. + * + * It can return one of : + * - SN_ERR_NONE if everything's OK + * - SN_ERR_SRVTO if there are no more servers + * - SN_ERR_SRVCL if the connection was refused by the server + * - SN_ERR_PRXCOND if the connection has been limited by the proxy (maxconn) + * - SN_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...) + * - SN_ERR_INTERNAL for any other purely internal errors + * Additionnally, in the case of SN_ERR_RESOURCE, an emergency log will be emitted. + */ +static int establish_chk(struct task *t) +{ + struct check *check = t->context; + + if (check->type == PR_O2_EXT_CHK) + return establish_proc_chk(t); + return establish_conn_chk(t); +} + /* * manages a server health-check. Returns * the time the task accepts to wait, or TIME_ETERNITY for infinity. 
@@ -1465,6 +1758,7 @@ static struct task *process_chk(struct task *t) int rv; int ret; int expired = tick_is_expired(t->expire, now_ms); + int has_conn = check->type != PR_O2_EXT_CHK; if (!(check->state & CHK_RUNNING)) { /* no check currently running */ @@ -1485,10 +1779,13 @@ static struct task *process_chk(struct task *t) set_server_check_status(check, HCHK_STATUS_START, NULL); check->state |= CHK_RUNNING; - check->bi->p = check->bi->data; - check->bi->i = 0; - check->bo->p = check->bo->data; - check->bo->o = 0; + + if (has_conn) { + check->bi->p = check->bi->data; + check->bi->i = 0; + check->bo->p = check->bo->data; + check->bo->o = 0; + } ret = establish_chk(t); switch (ret) { @@ -1503,7 +1800,9 @@ static struct task *process_chk(struct task *t) int t_con = tick_add(now_ms, s->proxy->timeout.connect); t->expire = tick_first(t->expire, t_con); } - conn_data_poll_recv(conn); /* prepare for reading a possible reply */ + if (has_conn) { + conn_data_poll_recv(conn); /* prepare for reading a possible reply */ + } goto reschedule; case SN_ERR_SRVTO: /* ETIMEDOUT */ @@ -1541,15 +1840,15 @@ static struct task *process_chk(struct task *t) * First, let's check whether there was an uncaught error, * which can happen on connect timeout or error. 
*/ - if (s->check.result == SRV_CHK_UNKNOWN) { - if ((conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L4_CONN)) == CO_FL_WAIT_L4_CONN) { + if (check->result == SRV_CHK_UNKNOWN) { + if (has_conn && (conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L4_CONN)) == CO_FL_WAIT_L4_CONN) { /* L4 not established (yet) */ if (conn->flags & CO_FL_ERROR) set_server_check_status(check, HCHK_STATUS_L4CON, NULL); else if (expired) set_server_check_status(check, HCHK_STATUS_L4TOUT, NULL); } - else if ((conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L6_CONN)) == CO_FL_WAIT_L6_CONN) { + else if (has_conn && (conn->flags & (CO_FL_CONNECTED|CO_FL_WAIT_L6_CONN)) == CO_FL_WAIT_L6_CONN) { /* L6 not established (yet) */ if (conn->flags & CO_FL_ERROR) set_server_check_status(check, HCHK_STATUS_L6RSP, NULL); @@ -1563,6 +1862,23 @@ static struct task *process_chk(struct task *t) else set_server_check_status(check, HCHK_STATUS_L4OK, NULL); } + else if (!has_conn) { + struct pid_list *elem = check->curpid; + int status; + + if (elem->exited) { + status = elem->status; /* Save in case the process exits between use below */ + if (!WIFEXITED(status)) + check->code = -1; + else + check->code = WEXITSTATUS(status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + status = HCHK_STATUS_PROCERR; + else + status = HCHK_STATUS_PROCOK; + set_server_check_status(check, status, NULL); + } + } else if (expired) { /* connection established but expired check */ if (check->type == PR_O2_SSL3_CHK) @@ -1576,7 +1892,7 @@ static struct task *process_chk(struct task *t) } /* check complete or aborted */ - if (conn->xprt) { + if (has_conn && conn->xprt) { /* The check was aborted and the connection was not yet closed. 
* This can happen upon timeout, or when an external event such * as a failed response coupled with "observe layer7" caused the @@ -1600,6 +1916,9 @@ static struct task *process_chk(struct task *t) process_result(&check->server->agent); } + if (!has_conn) + pid_list_del(check->curpid); + rv = 0; if (global.spread_checks > 0) { rv = srv_getinter(check) * global.spread_checks / 100; @@ -1614,7 +1933,6 @@ static struct task *process_chk(struct task *t) out_wait: return t; } - static int start_check_task(struct check *check, int mininter, int nbcheck, int srvpos) { @@ -1683,6 +2001,13 @@ int start_checks() { * the number of servers, weighted by the server's position in the list. */ for (px = proxy; px; px = px->next) { + if ((px->options2 & PR_O2_CHK_ANY) == PR_O2_EXT_CHK) { + if (init_pid_list()) { + Alert("Starting [%s] check: out of memory.\n", px->id); + return -1; + } + } + for (s = px->srv; s; s = s->next) { if (s->slowstart) { if ((t = task_new()) == NULL) { -- 1.7.10.4