Subject: [RFC] Decrease server health based on http responses / events

This RFC quality patch implements decreasing server health based on
observing communication between HAProxy and servers.

I have had a working patch for this for a long time, however I needed to
rewrite nearly everything to remove hardcoded values, add more modes and
to port it into 1.4. So after the rework there is nearly nothing left from
the old code. :| In the current status the code is expected to work but it
definitely needs more testing.

BTW: I'm not very happy with names of both functions and parameters,
If you have a better idea please don't hesitate to propose it. ;)

TODO: documentation, comments, pure tcp support.

diff --git a/include/common/defaults.h b/include/common/defaults.h
index b0aee86..ae2f65c 100644
--- a/include/common/defaults.h
+++ b/include/common/defaults.h
@@ -120,6 +120,9 @@
 #define DEF_CHECK_REQ   "OPTIONS / HTTP/1.0\r\n\r\n"
 #define DEF_SMTP_CHECK_REQ   "HELO localhost\r\n"
 
+#define DEF_HANA_ONERR HANA_ONERR_FAILCHK
+#define DEF_CELIMIT    10
+
 // X-Forwarded-For header default
 #define DEF_XFORWARDFOR_HDR    "X-Forwarded-For"
 
diff --git a/include/proto/checks.h b/include/proto/checks.h
index bd70164..2d16976 100644
--- a/include/proto/checks.h
+++ b/include/proto/checks.h
@@ -29,6 +29,7 @@ const char *get_check_status_description(short check_status);
 const char *get_check_status_info(short check_status);
 struct task *process_chk(struct task *t);
 int start_checks();
+void halth_analyze(struct server *s, short status);
 
 #endif /* _PROTO_CHECKS_H */
 
diff --git a/include/types/checks.h b/include/types/checks.h
index 1b04608..3690aa5 100644
--- a/include/types/checks.h
+++ b/include/types/checks.h
@@ -18,6 +18,9 @@ enum {
 
        /* Below we have finished checks */
        HCHK_STATUS_CHECKED,            /* DUMMY STATUS */
+
+       HCHK_STATUS_HANA,               /* Detected enough consecutive errors */
+
        HCHK_STATUS_SOCKERR,            /* Socket error */
 
        HCHK_STATUS_L4OK,               /* L4 check passed, for example tcp 
connect */
@@ -41,6 +44,39 @@ enum {
        HCHK_STATUS_SIZE
 };
 
+enum {
+       HANA_UNKNOWN    = 0,
+
+       HANA_TCP_OK,
+
+       HANA_HTTP_OK,
+       HANA_HTTP_STS,
+       HANA_HTTP_HDRRSP,
+       HANA_HTTP_RSP,
+
+       HANA_READ_ERROR,
+       HANA_READ_TIMEOUT,
+       HANA_BROKEN_PIPE,
+
+       HANA_SIZE
+};
+
+enum {
+       HANA_ONERR_UNKNOWN      = 0,
+
+       HANA_ONERR_FASTINTER,
+       HANA_ONERR_FAILCHK,
+       HANA_ONERR_SUDDTH,
+       HANA_ONERR_MARKDWN,
+};
+
+enum {
+       HANA_OBS_NONE           = 0,
+
+       HANA_OBS_EVENTS,
+       HANA_OBS_HTTP_RSPS,
+};
+
 struct check_status {
        short result;                   /* one of SRV_CHK_* */
        char *info;                     /* human readable short info */
diff --git a/include/types/server.h b/include/types/server.h
index b3fe83d..b163190 100644
--- a/include/types/server.h
+++ b/include/types/server.h
@@ -115,7 +115,10 @@ struct server {
        struct sockaddr_in check_addr;          /* the address to check, if 
different from <addr> */
        short check_port;                       /* the port to use for the 
health checks */
        int health;                             /* 0->rise-1 = bad; 
rise->rise+fall-1 = good */
+       int consecutive_errors;                 /* */
        int rise, fall;                         /* time in iterations */
+       int consecutive_errors_limit;           /* */
+       short observe, onerror;                 /* */
        int inter, fastinter, downinter;        /* checks: time in milliseconds 
*/
        int slowstart;                          /* slowstart time in seconds 
(ms in the conf) */
        int result;                             /* health-check result : 
SRV_CHK_* */
@@ -137,7 +140,7 @@ struct server {
        unsigned down_time;                     /* total time the server was 
down */
        time_t last_change;                     /* last time, when the state 
was changed */
        struct timeval check_start;             /* last health check start time 
*/
-       unsigned long check_duration;           /* time in ms took to finish 
last health check */
+       long check_duration;                    /* time in ms took to finish 
last health check */
        short check_status, check_code;         /* check result, check code */
        char check_desc[HCHK_DESC_LEN];         /* healt check descritpion */
 
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 428d7b9..889fc74 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -2541,6 +2541,8 @@ int cfg_parse_listen(const char *file, int linenum, char 
**args, int kwm)
                newsrv->uweight = newsrv->iweight = 1;
                newsrv->maxqueue = 0;
                newsrv->slowstart = 0;
+               newsrv->onerror = DEF_HANA_ONERR;
+               newsrv->consecutive_errors_limit = DEF_CELIMIT;
 
                cur_arg = 3;
                while (*args[cur_arg]) {
@@ -2746,6 +2748,66 @@ int cfg_parse_listen(const char *file, int linenum, char 
**args, int kwm)
                                do_check = 1;
                                cur_arg += 1;
                        }
+                       else if (!strcmp(args[cur_arg], "observe")) {
+                               if (!strcmp(args[cur_arg + 1], "none"))
+                                       newsrv->observe = HANA_OBS_NONE;
+                               else if (!strcmp(args[cur_arg + 1], "events"))
+                                       newsrv->observe = HANA_OBS_EVENTS;
+                               else if (!strcmp(args[cur_arg + 1], 
"httprsps")) {
+                                       if (curproxy->mode != PR_MODE_HTTP) {
+                                               Alert("parsing [%s:%d]: '%s' 
expects one of 'none', "
+                                                       "'events', 'httprsps' 
but get '%s'\n",
+                                                       file, linenum, 
args[cur_arg], args[cur_arg + 1]);
+                                               err_code |= ERR_ALERT;
+                                       }
+                                       newsrv->observe = HANA_OBS_HTTP_RSPS;
+                               }
+                               else {
+                                       Alert("parsing [%s:%d]: '%s' expects 
one of 'none', "
+                                               "'events', 'httprsps' but get 
'%s'\n",
+                                               file, linenum, args[cur_arg], 
args[cur_arg + 1]);
+                                       err_code |= ERR_ALERT | ERR_FATAL;
+                                       goto out;
+                               }
+
+                               cur_arg += 2;
+                       }
+                       else if (!strcmp(args[cur_arg], "onerror")) {
+                               if (!strcmp(args[cur_arg + 1], "fastinter"))
+                                       newsrv->onerror = HANA_ONERR_FASTINTER;
+                               else if (!strcmp(args[cur_arg + 1], 
"failcheck"))
+                                       newsrv->onerror = HANA_ONERR_FAILCHK;
+                               else if (!strcmp(args[cur_arg + 1], 
"suddendeath"))
+                                       newsrv->onerror = HANA_ONERR_SUDDTH;
+                               else if (!strcmp(args[cur_arg + 1], "markdown"))
+                                       newsrv->onerror = HANA_ONERR_MARKDWN;
+                               else {
+                                       Alert("parsing [%s:%d]: '%s' expects 
one of 'fastinter', "
+                                               "'failcheck', 'suddendeath' or 
'godown' but get '%s'\n",
+                                               file, linenum, args[cur_arg], 
args[cur_arg + 1]);
+                                       err_code |= ERR_ALERT | ERR_FATAL;
+                                       goto out;
+                               }
+
+                               cur_arg += 2;
+                       }
+                       else if (!strcmp(args[cur_arg], "allowed_error")) {
+                               if (!*args[cur_arg + 1]) {
+                                       Alert("parsing [%s:%d]: '%s' expects an 
integer argument.\n",
+                                               file, linenum, args[cur_arg]);
+                                       err_code |= ERR_ALERT | ERR_FATAL;
+                                       goto out;
+                               }
+
+                               newsrv->consecutive_errors_limit = 
atoi(args[cur_arg + 1]);
+
+                               if (newsrv->consecutive_errors_limit <= 0) {
+                                       Alert("parsing [%s:%d]: %s has to be > 
0.\n",
+                                               file, linenum, args[cur_arg]);
+                                       err_code |= ERR_ALERT | ERR_FATAL;
+                                       goto out;
+                               }
+                       }
                        else if (!strcmp(args[cur_arg], "source")) {  /* 
address to which we bind when connecting */
                                int port_low, port_high;
                                if (!*args[cur_arg + 1]) {
diff --git a/src/checks.c b/src/checks.c
index 0e9aca5..61bac4c 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -52,6 +52,8 @@ const struct check_status check_statuses[HCHK_STATUS_SIZE] = {
        [HCHK_STATUS_INI]       = { SRV_CHK_UNKNOWN,                   "INI",   
  "Initializing" },
        [HCHK_STATUS_START]     = { /* SPECIAL STATUS*/ },
 
+       [HCHK_STATUS_HANA]      = { SRV_CHK_ERROR,                     "HANA",  
  "Health analyze" },
+
        [HCHK_STATUS_SOCKERR]   = { SRV_CHK_ERROR,                     
"SOCKERR", "Socket error" },
 
        [HCHK_STATUS_L4OK]      = { SRV_CHK_RUNNING,                   "L4OK",  
  "Layer4 check passed" },
@@ -72,6 +74,21 @@ const struct check_status check_statuses[HCHK_STATUS_SIZE] = 
{
        [HCHK_STATUS_L7STS]     = { SRV_CHK_ERROR,                     "L7STS", 
  "Layer7 wrong status" },
 };
 
+const char *analyze_statuses[HANA_SIZE] = {
+       [HANA_UNKNOWN]          = "Unknown",
+
+       [HANA_TCP_OK]           = "Correct tcp response",
+
+       [HANA_HTTP_OK]          = "Correct http response",
+       [HANA_HTTP_STS]         = "Wrong http response",
+       [HANA_HTTP_HDRRSP]      = "Invalid http response (headers)",
+       [HANA_HTTP_RSP]         = "Invalid http response",
+
+       [HANA_READ_ERROR]       = "Read error",
+       [HANA_READ_TIMEOUT]     = "Read timeout",
+       [HANA_BROKEN_PIPE]      = "Close from server",
+};
+
 /*
  * Convert check_status code to description
  */
@@ -108,6 +125,21 @@ const char *get_check_status_info(short check_status) {
                return check_statuses[HCHK_STATUS_UNKNOWN].info;
 }
 
+const char *get_analyze_status(short analyze_status) {
+
+       const char *desc;
+
+       if (analyze_status < HANA_SIZE)
+               desc = analyze_statuses[analyze_status];
+       else
+               desc = NULL;
+
+       if (desc && *desc)
+               return desc;
+       else
+               return analyze_statuses[HANA_UNKNOWN];
+}
+
 #define SSP_O_VIA      0x0001
 #define SSP_O_HCHK     0x0002
 #define SSP_O_STATUS   0x0004
@@ -136,7 +168,8 @@ static void server_status_printf(struct chunk *msg, struct 
server *s, unsigned o
                        chunk_printf(msg, "\"");
                }
 
-               chunk_printf(msg, ", check duration: %lums", s->check_duration);
+               if (s->check_duration >= 0)
+                       chunk_printf(msg, ", check duration: %ldms", 
s->check_duration);
        }
 
        if (options & SSP_O_STATUS) {
@@ -184,13 +217,14 @@ static void set_server_check_status(struct server *s, 
short status, char *desc)
 
        s->check_status = status;
        if (check_statuses[status].result)
-               s->result |= check_statuses[status].result;
+               s->result = check_statuses[status].result;
 
        if (!tv_iszero(&s->check_start)) {
                /* set_server_check_status() may be called more than once */
                s->check_duration = tv_ms_elapsed(&s->check_start, &now);
                tv_zero(&s->check_start);
-       }
+       } else
+               s->check_duration = -1;
 
        if (s->proxy->options2 & PR_O2_LOGHCHKS &&
        (((s->health != 0) && (s->result & SRV_CHK_ERROR)) ||
@@ -229,6 +263,10 @@ static void set_server_check_status(struct server *s, 
short status, char *desc)
                                if (health >= rise)
                                        health = rise + fall - 1; /* OK now */
                        }
+
+                       /* clear consecutive_errors if observing is enabled */
+                       if (s->onerror)
+                               s->consecutive_errors = 0;
                }
                /* FIXME end: calculate local version of the 
health/rise/fall/state */
 
@@ -505,6 +543,102 @@ static void set_server_enabled(struct server *s) {
                        set_server_enabled(srv);
 }
 
+void halth_analyze(struct server *s, short status) {
+
+       int failed = -1;
+       int expire;
+
+       /* return if observing or healt check is not enabled */
+       if (!s->observe || !s->check)
+               return;
+
+       switch (status) {
+               case HANA_TCP_OK:
+               case HANA_HTTP_OK:
+                       failed = 0;
+                       break;
+
+               case HANA_HTTP_STS:
+                       if (s->observe == HANA_OBS_HTTP_RSPS)
+                               failed = 1;
+                       else
+                               failed = 0;
+                       break;
+
+               case HANA_HTTP_HDRRSP:
+               case HANA_HTTP_RSP:
+               case HANA_READ_ERROR:
+               case HANA_READ_TIMEOUT:
+               case HANA_BROKEN_PIPE:
+                       failed = 1;
+
+               default:
+                       break;
+       }
+
+       /* unknown status, ignore it */
+       if (failed < 0)
+               return;
+
+       if (!failed) {
+               /* good, clear consecutive_errors */
+               s->consecutive_errors = 0;
+               return;
+       }
+
+       s->consecutive_errors++;
+
+       if (s->consecutive_errors < s->consecutive_errors_limit)
+               return;
+
+       sprintf(trash, "Detected %d consecutive errors, last one was: %s",
+               s->consecutive_errors, get_analyze_status(status));
+
+       switch (s->onerror) {
+               case HANA_ONERR_FASTINTER:
+               /* force fastinter - nothing to do here as all modes force it */
+                       break;
+
+               case HANA_ONERR_SUDDTH:
+               /* simulate a pre-fatal failed health check */
+                       if (s->health > s->rise)
+                               s->health = s->rise + 1;
+
+                       /* no break - fall through */
+
+               case HANA_ONERR_FAILCHK:
+               /* simulate a failed health check */
+                       set_server_check_status(s, HCHK_STATUS_HANA, trash);
+
+                       if (s->health > s->rise) {
+                               s->health--; /* still good */
+                               s->counters.failed_checks++;
+                       }
+                       else
+                               set_server_down(s);
+
+                       break;
+
+               case HANA_ONERR_MARKDWN:
+               /* mark server down */
+                       s->health = s->rise;
+                       set_server_check_status(s, HCHK_STATUS_HANA, trash);
+                       set_server_down(s);
+
+                       break;
+
+               default:
+                       /* write warning? */
+                       break;
+       }
+
+       s->consecutive_errors = 0;
+
+       expire = tick_add(now_ms, MS_TO_TICKS(s->fastinter));
+       if (s->check->expire > expire)
+               s->check->expire = expire;
+}
+
 /*
  * This function is used only for server health-checks. It handles
  * the connection acknowledgement. If the proxy requires L7 health-checks,
diff --git a/src/dumpstats.c b/src/dumpstats.c
index 522c3f8..d23602a 100644
--- a/src/dumpstats.c
+++ b/src/dumpstats.c
@@ -1545,7 +1545,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, 
struct uri_auth *uri)
                                        if (sv->check_status >= 
HCHK_STATUS_L57DATA)
                                                chunk_printf(&msg, "/%d", 
sv->check_code);
 
-                                       if (sv->check_status >= 
HCHK_STATUS_CHECKED)
+                                       if (sv->check_status >= 
HCHK_STATUS_CHECKED && sv->check_duration >= 0)
                                                chunk_printf(&msg, " in %lums", 
sv->check_duration);
                                } else {
                                        chunk_printf(&msg, "</td><td>");
diff --git a/src/proto_http.c b/src/proto_http.c
index 2300738..4d8f09d 100644
--- a/src/proto_http.c
+++ b/src/proto_http.c
@@ -41,6 +41,7 @@
 #include <proto/acl.h>
 #include <proto/backend.h>
 #include <proto/buffers.h>
+#include <proto/checks.h>
 #include <proto/client.h>
 #include <proto/dumpstats.h>
 #include <proto/fd.h>
@@ -2857,8 +2858,10 @@ int process_response(struct session *t)
 
                                buffer_shutr_now(rep);
                                buffer_shutw_now(req);
-                               if (t->srv)
+                               if (t->srv) {
                                        t->srv->counters.failed_resp++;
+                                       halth_analyze(t->srv, HANA_HTTP_HDRRSP);
+                               }
                                t->be->counters.failed_resp++;
                                rep->analysers = 0;
                                txn->status = 502;
@@ -2880,8 +2883,10 @@ int process_response(struct session *t)
                                        
http_capture_bad_message(&t->be->invalid_rep, t, rep, msg, t->fe);
                                buffer_shutr_now(rep);
                                buffer_shutw_now(req);
-                               if (t->srv)
+                               if (t->srv) {
                                        t->srv->counters.failed_resp++;
+                                       halth_analyze(t->srv, HANA_READ_ERROR);
+                               }
                                t->be->counters.failed_resp++;
                                rep->analysers = 0;
                                txn->status = 502;
@@ -2898,8 +2903,10 @@ int process_response(struct session *t)
                                        
http_capture_bad_message(&t->be->invalid_rep, t, rep, msg, t->fe);
                                buffer_shutr_now(rep);
                                buffer_shutw_now(req);
-                               if (t->srv)
+                               if (t->srv) {
                                        t->srv->counters.failed_resp++;
+                                       halth_analyze(t->srv, 
HANA_READ_TIMEOUT);
+                               }
                                t->be->counters.failed_resp++;
                                rep->analysers = 0;
                                txn->status = 504;
@@ -2915,8 +2922,10 @@ int process_response(struct session *t)
                                if (msg->err_pos >= 0)
                                        
http_capture_bad_message(&t->be->invalid_rep, t, rep, msg, t->fe);
                                buffer_shutw_now(req);
-                               if (t->srv)
+                               if (t->srv) {
                                        t->srv->counters.failed_resp++;
+                                       halth_analyze(t->srv, HANA_BROKEN_PIPE);
+                               }
                                t->be->counters.failed_resp++;
                                rep->analysers = 0;
                                txn->status = 502;
@@ -2971,6 +2980,20 @@ int process_response(struct session *t)
                if (n < 1 || n > 5)
                        n = 0;
 
+               switch(n) {
+                       case 1:
+                       case 2:
+                       case 3:
+                       case 4:
+                               halth_analyze(t->srv, HANA_HTTP_OK);
+                               break;
+
+                       case 5:
+                       default:
+                               halth_analyze(t->srv, HANA_HTTP_STS);
+                               break;
+               }
+
                t->srv->counters.p.http.rsp[n]++;
                t->be->counters.p.http.rsp[n]++;
 
@@ -3027,8 +3050,10 @@ int process_response(struct session *t)
                        if (rule_set->rsp_exp != NULL) {
                                if (apply_filters_to_response(t, rep, 
rule_set->rsp_exp) < 0) {
                                return_bad_resp:
-                                       if (t->srv)
+                                       if (t->srv) {
                                                t->srv->counters.failed_resp++;
+                                               halth_analyze(t->srv, 
HANA_HTTP_RSP);
+                                       }
                                        cur_proxy->counters.failed_resp++;
                                return_srv_prx_502:
                                        buffer_shutr_now(rep);

Reply via email to