In the case where agent-port is used and the agent
check is a secondary check to not mark a server as down
if the agent becomes unavailable.

In this configuration the agent should only cause a server to be marked
as down if the agent returns "fail", "stopped" or "down".

Signed-off-by: Simon Horman <ho...@verge.net.au>

---

v4
* Never update health of agent on failure to connect

  Previously the health was recharged on such cases.
  But this causes the health to be recharged even
  if the most recent result from the agent was "fail", "stopped", or "down".

  The result was unnecessary logging of "fail", "stopped", or "down"
  results from the agent if they were interleaved with timeouts.
  Or in other words, if the agent responded only occasionally
  but consistently with "fail", "stopped", or "down".

v3
* No change

v2
* Only log agent server status if the change in status
  will affect the server's state
---
 doc/configuration.txt  |  3 ++
 include/types/server.h |  5 ++--
 src/checks.c           | 77 +++++++++++++++++++++++++++++++++-----------------
 3 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/doc/configuration.txt b/doc/configuration.txt
index 233be4e..40675f4 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -7707,6 +7707,9 @@ agent-port <port>
   using the "port" parameter. A third scenario is to use "http-check
   agent-hdr" to run an agent and http checks simultaneously over HTTP.
 
+  When run in this manner failure to connect to the agent is not
+  considered an error as connectivity is tested by the primary check.
+
   See also the "agent-inter" parameter.
 
 backup
diff --git a/include/types/server.h b/include/types/server.h
index dd17d61..f316e01 100644
--- a/include/types/server.h
+++ b/include/types/server.h
@@ -52,8 +52,9 @@
 #define SRV_GOINGDOWN  0x0020  /* this server says that it's going down (404) 
*/
 #define SRV_WARMINGUP  0x0040  /* this server is warming up after a failure */
 #define SRV_MAINTAIN   0x0080  /* this server is in maintenance mode */
-#define SRV_DRAIN      0x0100  /* this server has been requested to drain its 
connections */
-/* unused: 0x0200, 0x0400 */
+#define SRV_WILLDRAIN  0x0100  /* this server has been requested to drain its 
connections but has not started doing so yet */
+#define SRV_DRAIN      0x0200  /* this been requested to drain its connections 
*/
+/* unused: 0x0400 */
 #define SRV_SEND_PROXY 0x0800  /* this server talks the PROXY protocol */
 #define SRV_NON_STICK  0x1000  /* never add connections allocated to this 
server to a stick table */
 
diff --git a/src/checks.c b/src/checks.c
index 706ed31..88a34ea 100644
--- a/src/checks.c
+++ b/src/checks.c
@@ -228,11 +228,19 @@ static void set_server_check_status(struct check *check, 
short status, const cha
                tv_zero(&check->start);
        }
 
+       /* Failure to connect to the agent as a secondary check should not
+        * cause the server to be marked down. So only log status changes
+        * for HCHK_STATUS_* statuses */
+       if (check == &s->agent && check->status < HCHK_STATUS_L7TOUT)
+               return;
+
        if (s->proxy->options2 & PR_O2_LOGHCHKS &&
        (((check->health != 0) && (check->result & SRV_CHK_FAILED)) ||
            ((check->health != s->rise + s->fall - 1) && (check->result & 
SRV_CHK_PASSED)) ||
            ((s->state & SRV_GOINGDOWN) && !(check->result & SRV_CHK_DISABLE)) 
||
-           (!(s->state & SRV_GOINGDOWN) && (check->result & 
SRV_CHK_DISABLE)))) {
+           (!(s->state & SRV_GOINGDOWN) && (check->result & SRV_CHK_DISABLE)) 
||
+           ((s->state & SRV_WILLDRAIN) && !(s->state & SRV_DRAIN)) ||
+           (!(s->state & SRV_WILLDRAIN) && (s->state & SRV_DRAIN)))) {
 
                int health, rise, fall, state;
 
@@ -284,7 +292,8 @@ static void set_server_check_status(struct check *check, 
short status, const cha
                chunk_appendf(&trash, ", status: %d/%d %s",
                             (state & SRV_RUNNING) ? (health - rise + 1) : 
(health),
                             (state & SRV_RUNNING) ? (fall) : (rise),
-                            (state & SRV_RUNNING)?"UP":"DOWN");
+                            (state & SRV_RUNNING) ?
+                            ((state & SRV_WILLDRAIN) ? "DRAIN" : "UP") : 
"DOWN");
 
                Warning("%s.\n", trash.str);
                send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str);
@@ -611,6 +620,27 @@ static void set_server_enabled(struct server *s) {
                        set_server_enabled(srv);
 }
 
+static void check_failed(struct check *check)
+{
+       struct server *s = check->server;
+
+       /* The agent secondary check should only cause a server to be marked
+        * as down if check->status is HCHK_STATUS_L7STS, which indicates
+        * that the agent returned "fail", "stopped" or "down".
+        * The implication here is that failure to connect to the agent
+        * as a secondary check should not cause the server to be marked
+        * down. */
+       if (check == &s->agent && check->status != HCHK_STATUS_L7STS)
+               return;
+
+       if (check->health > s->rise) {
+               check->health--; /* still good */
+               s->counters.failed_checks++;
+       }
+       else
+               set_server_down(check);
+}
+
 void health_adjust(struct server *s, short status)
 {
        int failed;
@@ -668,13 +698,7 @@ void health_adjust(struct server *s, short status)
                case HANA_ONERR_FAILCHK:
                /* simulate a failed health check */
                        set_server_check_status(&s->check, HCHK_STATUS_HANA, 
trash.str);
-
-                       if (s->check.health > s->rise) {
-                               s->check.health--; /* still good */
-                               s->counters.failed_checks++;
-                       }
-                       else
-                               set_server_down(&s->check);
+                       check_failed(&s->check);
 
                        break;
 
@@ -854,11 +878,6 @@ static void agent_expect(struct check *check, char *data)
                down_cmd = "fail";
        }
 
-       if (drain)
-               check->server->state |= SRV_DRAIN;
-       else
-               check->server->state &= ~SRV_DRAIN;
-
        if (down_cmd) {
                const char *end = data + strlen(down_cmd);
                /*
@@ -871,7 +890,23 @@ static void agent_expect(struct check *check, char *data)
                }
        }
 
+       /* Signal intended drain state.
+        * A miss-match between the SRV_WILLDRAIN and SRV_DRAIN bits
+        * is used by set_server_check_status to log changes
+        * in state */
+       if (drain)
+               check->server->state |= SRV_WILLDRAIN;
+       else if (!down_cmd)
+               check->server->state &= ~SRV_WILLDRAIN;
+
        set_server_check_status(check, status, desc);
+
+        /* Set or clear SRV_DRAIN now that any change in drain
+         * state has been detected by set_server_check_status(). */
+       if (drain)
+               check->server->state |= SRV_DRAIN;
+       else if (!down_cmd)
+               check->server->state &= ~SRV_DRAIN;
 }
 
 /*
@@ -1315,12 +1350,7 @@ static void process_result(struct check *check)
        struct server *s = check->server;
 
        if (check->result & SRV_CHK_FAILED) {    /* a failure or timeout 
detected */
-               if (check->health > s->rise) {
-                       check->health--; /* still good */
-                       s->counters.failed_checks++;
-               }
-               else
-                       set_server_down(check);
+               check_failed(check);
        }
        else {  /* check was OK */
                /* we may have to add/remove this server from the LB group */
@@ -1462,12 +1492,7 @@ static struct task *process_chk(struct task *t)
                /* here, we have seen a synchronous error, no fd was allocated 
*/
 
                check->state &= ~CHK_RUNNING;
-               if (check->health > s->rise) {
-                       check->health--; /* still good */
-                       s->counters.failed_checks++;
-               }
-               else
-                       set_server_down(check);
+               check_failed(check);
 
                /* we allow up to min(inter, timeout.connect) for a connection
                 * to establish but only when timeout.check is set
-- 
1.8.3.2


Reply via email to