I've been trying to understand a change of behaviour between v2.2dev5 and
v2.2dev6. Indeed our probe is regularly testing to add and remove
servers on a given backend such as:
# echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
113 be_foo 1 srv0 10.236.139.34 2 0 1 1 263 15 3 4 6 0 0 0 - 31255 -
113 be_foo 2 srv1 0.0.0.0 0 1 256 256 0 15 3 0 14 0 0 0 - 0 -
-> curl on the corresponding frontend: reply from server:31255
# echo "set server be_foo/srv1 addr 10.236.139.34 port 31257" | sudo socat
stdio /var/lib/haproxy/stats
IP changed from '0.0.0.0' to '10.236.139.34', port changed from '0' to '31257'
by 'stats socket command'
# echo "set server be_foo/srv1 weight 256" | sudo socat stdio
/var/lib/haproxy/stats
# echo "set server be_foo/srv1 check-port 8500" | sudo socat stdio
/var/lib/haproxy/stats
health check port updated.
# echo "set server be_foo/srv1 state ready" | sudo socat stdio
/var/lib/haproxy/stats
# echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
113 be_foo 1 srv0 10.236.139.34 2 0 1 1 105 15 3 4 6 0 0 0 - 31255 -
113 be_foo 2 srv1 10.236.139.34 2 0 256 256 2319 15 3 2 6 0 0 0 - 31257 -
-> curl on the corresponding frontend: reply for server:31257
(notice the difference of weight)
# echo "set server be_foo/srv1 state maint" | sudo socat stdio
/var/lib/haproxy/stats
# echo "set server be_foo/srv1 addr 0.0.0.0 port 0" | sudo socat stdio
/var/lib/haproxy/stats
IP changed from '10.236.139.34' to '0.0.0.0', port changed from '31257' to '0'
by 'stats socket command'
# echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
113 be_foo 1 srv0 10.236.139.34 2 0 1 1 263 15 3 4 6 0 0 0 - 31255 -
113 be_foo 2 srv1 0.0.0.0 0 1 256 256 0 15 3 0 14 0 0 0 - 0 -
-> curl on the corresponding frontend: reply from server:31255
# echo "set server be_foo/srv1 addr 10.236.139.34 port 31256" | sudo socat
stdio /var/lib/haproxy/stats
IP changed from '0.0.0.0' to '10.236.139.34', port changed from '0' to '31256'
by 'stats socket command'
# echo "set server be_foo/srv1 weight 256" | sudo socat stdio
/var/lib/haproxy/stats
# echo "set server be_foo/srv1 check-port 8500" | sudo socat stdio
/var/lib/haproxy/stats
health check port updated.
# echo "set server be_foo/srv1 state ready" | sudo socat stdio
/var/lib/haproxy/stats
# echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
113 be_foo 1 srv0 10.236.139.34 2 0 1 1 105 15 3 4 6 0 0 0 - 31255 -
113 be_foo 2 srv1 10.236.139.34 2 0 256 256 2319 15 3 2 6 0 0 0 - 31256 -
-> curl on the corresponding frontend: reply from server:31257 (!)
Here we indeed would expect to get an anver from server:31256. The issue
is highly linked to the usage of `pool-purge-delay`, with a value which
is higher than the duration of the test, 10s in our case.
a git bisect between dev5 and dev6 seems to show commit
079cb9af22da6 ("MEDIUM: connections: Revamp the way idle connections are
killed")
being the origin of this new behaviour.
So if I understand the later correctly, it seems that it was more a
matter of chance that we did not saw the issue earlier.
My patch proposes to force clean idle connections in the two following
cases:
- we set a (still running) server to maintenance
- we change the ip/port of a server
A backport is not needed as the commit revealing the issue is present
only on v2.2 branch only.
Fixes: 079cb9af22da6 ("MEDIUM: connections: Revamp the way idle
connections are killed")
Signed-off-by: William Dauchy <[email protected]>
---
include/proto/server.h | 1 +
src/server.c | 37 ++++++++++++++++++++++++++++++++++++-
2 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/include/proto/server.h b/include/proto/server.h
index f8fed3148..388c7e2ff 100644
--- a/include/proto/server.h
+++ b/include/proto/server.h
@@ -65,6 +65,7 @@ const char *update_server_fqdn(struct server *server, const
char *fqdn, const ch
int snr_resolution_cb(struct dns_requester *requester, struct dns_nameserver
*nameserver);
int snr_resolution_error_cb(struct dns_requester *requester, int error_code);
struct server *snr_check_ip_callback(struct server *srv, void *ip, unsigned
char *ip_family);
+void srv_cleanup_connections(struct server *srv);
struct task *srv_cleanup_idle_connections(struct task *task, void *ctx,
unsigned short state);
struct task *srv_cleanup_toremove_connections(struct task *task, void
*context, unsigned short state);
diff --git a/src/server.c b/src/server.c
index 4d5e706d3..751439d76 100644
--- a/src/server.c
+++ b/src/server.c
@@ -3669,8 +3669,11 @@ const char *update_server_addr_port(struct server *s,
const char *addr, const ch
}
out:
- if (changed)
+ if (changed) {
+ /* force connection cleanup on the given server */
+ srv_cleanup_connections(s);
srv_set_dyncookie(s);
+ }
if (updater)
chunk_appendf(msg, " by '%s'", updater);
chunk_appendf(msg, "\n");
@@ -4832,6 +4835,8 @@ static void srv_update_status(struct server *s)
if (s->onmarkeddown &
HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS)
srv_shutdown_streams(s, SF_ERR_DOWN);
+ /* force connection cleanup on the given server */
+ srv_cleanup_connections(s);
/* we might have streams queued on this server and
waiting for
* a connection. Those which are redispatchable will be
queued
* to another server or to the proxy itself.
@@ -5160,6 +5165,36 @@ struct task *srv_cleanup_toremove_connections(struct
task *task, void *context,
return task;
}
+/* cleanup connections for a given server
+ * might be useful when going on forced maintenance or live changing ip/port
+ */
+void srv_cleanup_connections(struct server *srv)
+{
+ struct connection *conn;
+ int did_remove;
+ int i;
+ int j;
+
+ HA_SPIN_LOCK(OTHER_LOCK, &idle_conn_srv_lock);
+ for (i = 0; i < global.nbthread; i++) {
+ did_remove = 0;
+ HA_SPIN_LOCK(OTHER_LOCK, &toremove_lock[i]);
+ for (j = 0; j < srv->curr_idle_conns; j++) {
+ conn = MT_LIST_POP(&srv->idle_conns[i], struct
connection *, list);
+ if (!conn)
+ conn = MT_LIST_POP(&srv->safe_conns[i],
+ struct connection *, list);
+ if (!conn)
+ break;
+ MT_LIST_ADDQ(&toremove_connections[i], (struct mt_list
*)&conn->list);
+ }
+ HA_SPIN_UNLOCK(OTHER_LOCK, &toremove_lock[i]);
+ if (did_remove)
+ task_wakeup(idle_conn_cleanup[i], TASK_WOKEN_OTHER);
+ }
+ HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conn_srv_lock);
+}
+
struct task *srv_cleanup_idle_connections(struct task *task, void *context,
unsigned short state)
{
struct server *srv;
--
2.26.2