Hi Willy,

My original concern was that if two servers had values of lb_server_key
that are close to each other, then there could be some overlap in their
values of lb_server_key + node_index;, which is why I was looking for a
reasonable hash function to combine them. I based it on boost::hash_combine,
but since writing that I came across https://stackoverflow.com/a/50978188
explaining why boost::hash_combine isn't necessarily a good hash function.

I agree that it's not too critical to have a few collisions, but I'm reaching
the limits of my knowledge here. Does this change address your concerns?
I've attached an updated patch, and here's the diff between them:

diff --git a/src/lb_chash.c b/src/lb_chash.c
index c77222854..d7a1b2539 100644
--- a/src/lb_chash.c
+++ b/src/lb_chash.c
@@ -66,9 +66,7 @@ static inline void chash_dequeue_srv(struct server *s)
  */
 static inline u32 chash_compute_node_key(struct server *s, unsigned node_index)
 {
-    u32 server_key = s->lb_server_key;
-    u32 index_key = full_hash(node_index);
-    return server_key ^= index_key + 0x9e3779b9 + (server_key << 6) +
(server_key >> 2);
+    return full_hash(s->lb_server_key + node_index);
 }

 /* Compute the base server key that will be used to compute node keys. Servers
@@ -112,8 +110,7 @@ static inline u32 chash_compute_server_key(struct server *s)
     case SRV_HASH_KEY_ADDR:
         switch (srv_addr.family) {
         case AF_INET:
-            u32 addr_key = full_hash(srv_addr.addr.v4.s_addr);
-            key ^= addr_key + 0x9e3779b9 + (key << 6) + (key >> 2);
+            key = full_hash(key + srv_addr.addr.v4.s_addr);
             break;
         case AF_INET6:
             key = XXH32(srv_addr.addr.v6.s6_addr, 16, key);


Thanks,
Anthony
From 3280a39664bf913807200f21d1b952856966d141 Mon Sep 17 00:00:00 2001
From: Anthony Deschamps <anthony.j.descha...@gmail.com>
Date: Fri, 16 Feb 2024 16:00:35 -0500
Subject: [PATCH] MEDIUM: lb-chash: Deterministic node hashes based on server
 address

Motivation: When services are discovered through DNS resolution, the order in
which DNS records get resolved and assigned to servers is arbitrary. Therefore,
even though two HAProxy instances using chash balancing might agree that a
particular request should go to server3, it is likely the case that they have
assigned different IPs and ports to the server in that slot.

This patch adds a server option, "hash-key <key>" which can be set to "id" (the
existing behaviour, defaut), "addr", or "addr-port". By deriving the keys for
the chash tree nodes from a server's address and port we ensure that independent
HAProxy instances will agree on routing decisions. If an address is not known
then the key is derived from the server's puid as it was previously.

When adjusting a server's weight, we now unconditionally remove all nodes first,
since the server's address (and therefore the desired node keys) may have changed.
---
 doc/configuration.txt      | 24 ++++++++++
 include/haproxy/server-t.h |  9 ++++
 src/lb_chash.c             | 90 +++++++++++++++++++++++++++++++++++---
 src/server.c               | 31 +++++++++++++
 4 files changed, 148 insertions(+), 6 deletions(-)

diff --git a/doc/configuration.txt b/doc/configuration.txt
index 1065e6098..052d41f9f 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -4813,6 +4813,8 @@ error-log-format                          X          X         X         -
 force-persist                             -          -         X         X
 filter                                    -          X         X         X
 fullconn                                  X          -         X         X
+hash-balance-factor                       X          -         X         X
+hash-key                                  X          -         X         X
 hash-type                                 X          -         X         X
 http-after-response                       X (!)      X         X         X
 http-check comment                        X          -         X         X
@@ -6606,6 +6608,28 @@ hash-balance-factor <factor>
   See also : "balance" and "hash-type".
 
 
+hash-key <key>
+  Specify how "hash-type consistent" node keys are computed
+
+  Arguments :
+    <key>   <key> may be one of the following :
+
+      id         The node keys will be derived from the server's position in
+                 the server list.
+
+      addr       The node keys will be derived from the server's address, when
+                 available, or else fall back on "id".
+
+      addr-port  The node keys will be derived from the server's address and
+                 port, when available, or else fall back on "id".
+
+  The "addr" and "addr-port" options may be useful in scenarios where multiple
+  HAProxy processes are balancing traffic to the same set of servers. If the
+  server order of each process is different (because, for example, DNS records
+  were resolved in different orders) then this will allow each independent
+  HAProxy processes to agree on routing decisions.
+
+
 hash-type <method> <function> <modifier>
   Specify a method to use for mapping hashes to servers
 
diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h
index f077ff2f8..811eae712 100644
--- a/include/haproxy/server-t.h
+++ b/include/haproxy/server-t.h
@@ -223,6 +223,13 @@ struct pid_list {
 	int exited;
 };
 
+/* srv methods of computing chash keys */
+enum srv_hash_key {
+	SRV_HASH_KEY_ID = 0,         /* derived from server puid */
+	SRV_HASH_KEY_ADDR,           /* derived from server address */
+	SRV_HASH_KEY_ADDR_PORT       /* derived from server address and port */
+};
+
 /* A tree occurrence is a descriptor of a place in a tree, with a pointer back
  * to the server itself.
  */
@@ -366,6 +373,8 @@ struct server {
 	struct tree_occ *lb_nodes;              /* lb_nodes_tot * struct tree_occ */
 	unsigned lb_nodes_tot;                  /* number of allocated lb_nodes (C-HASH) */
 	unsigned lb_nodes_now;                  /* number of lb_nodes placed in the tree (C-HASH) */
+	enum srv_hash_key hash_key;             /* method to compute node hash (C-HASH) */
+	unsigned lb_server_key;                 /* hash of the values indicated by "hash_key" (C-HASH) */
 
 	const struct netns_entry *netns;        /* contains network namespace name or NULL. Network namespace comes from configuration */
 	struct xprt_ops *xprt;                  /* transport-layer operations */
diff --git a/src/lb_chash.c b/src/lb_chash.c
index 1c8034d89..d7a1b2539 100644
--- a/src/lb_chash.c
+++ b/src/lb_chash.c
@@ -21,8 +21,9 @@
 #include <haproxy/backend.h>
 #include <haproxy/errors.h>
 #include <haproxy/queue.h>
-#include <haproxy/server-t.h>
+#include <haproxy/server.h>
 #include <haproxy/tools.h>
+#include <haproxy/xxhash.h>
 
 /* Return next tree node after <node> which must still be in the tree, or be
  * NULL. Lookup wraps around the end to the beginning. If the next node is the
@@ -58,6 +59,76 @@ static inline void chash_dequeue_srv(struct server *s)
 	}
 }
 
+/* Compute a key that can be used to insert a node into the CHASH tree. Servers
+ * have a base key, which can be computed in several ways (see
+ * chash_compute_server_key) and this function uses that seed to generate hash
+ * keys for however many nodes need to be inserted into the tree.
+ */
+static inline u32 chash_compute_node_key(struct server *s, unsigned node_index)
+{
+	return full_hash(s->lb_server_key + node_index);
+}
+
+/* Compute the base server key that will be used to compute node keys. Servers
+ * may be configured to determine their hashes either from their ID, address, or
+ * address+port; the latter options allow independent HAProxy instances to agree
+ * on routing decisions, regardless of their order in the server list (which may
+ * be arbitrary, since it could depend on factors such as the order of entries
+ * in a DNS SRV record). If an address is not known or if the server is
+ * configured with `hash-key id` (the default) then the key will be determined
+ * from the server's puid.
+ */
+static inline u32 chash_compute_server_key(struct server *s)
+{
+	u32 key = 0;
+	struct server_inetaddr srv_addr;
+
+	enum srv_hash_key hash_key = s->hash_key;
+
+	/* If hash-key is addr or addr-port then we need the address, but if we
+	 * can't determine the address then we fall back on hashing the puid.
+	 */
+	switch (s->hash_key) {
+	case SRV_HASH_KEY_ADDR:
+	case SRV_HASH_KEY_ADDR_PORT:
+		server_get_inetaddr(s, &srv_addr);
+		if (srv_addr.family != AF_INET && srv_addr.family != AF_INET6) {
+			hash_key = SRV_HASH_KEY_ID;
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	if (hash_key == SRV_HASH_KEY_ADDR_PORT) {
+		key = full_hash(srv_addr.port.svc);
+	}
+
+	switch (hash_key) {
+	case SRV_HASH_KEY_ADDR_PORT:
+	case SRV_HASH_KEY_ADDR:
+		switch (srv_addr.family) {
+		case AF_INET:
+			key = full_hash(key + srv_addr.addr.v4.s_addr);
+			break;
+		case AF_INET6:
+			key = XXH32(srv_addr.addr.v6.s6_addr, 16, key);
+			break;
+		default:
+			break;
+		}
+		break;
+
+	case SRV_HASH_KEY_ID:
+	default:
+		key = full_hash(s->puid);
+		break;
+	}
+
+	return key;
+}
+
 /* Adjust the number of entries of a server in its tree. The server must appear
  * as many times as its weight indicates it. If it's there too often, we remove
  * the last occurrences. If it's not there enough, we add more occurrences. To
@@ -67,6 +138,15 @@ static inline void chash_dequeue_srv(struct server *s)
  */
 static inline void chash_queue_dequeue_srv(struct server *s)
 {
+	u32 server_key = chash_compute_server_key(s);
+
+	/* If the server key changed then we must rehash all the nodes. */
+	if (server_key != s->lb_server_key) {
+		chash_dequeue_srv(s);
+		s->lb_nodes_tot = 0;
+		s->lb_server_key = server_key;
+	}
+
 	while (s->lb_nodes_now > s->next_eweight) {
 		if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
 			s->lb_nodes_now = s->lb_nodes_tot;
@@ -95,7 +175,7 @@ static inline void chash_queue_dequeue_srv(struct server *s)
 			    (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes));
 			for (j = s->lb_nodes_tot; j < s->next_eweight; j++) {
 				s->lb_nodes[j].server = s;
-				s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j);
+				s->lb_nodes[j].node.key = chash_compute_node_key(s, j);
 			}
 			s->lb_nodes_tot = s->next_eweight;
 		}
@@ -238,9 +318,6 @@ static void chash_update_server_weight(struct server *srv)
 	int old_state, new_state;
 	struct proxy *p = srv->proxy;
 
-	if (!srv_lb_status_changed(srv))
-		return;
-
 	/* If changing the server's weight changes its state, we simply apply
 	 * the procedures we already have for status change. If the state
 	 * remains down, the server is not in any tree, so it's as easy as
@@ -505,9 +582,10 @@ int chash_init_server_tree(struct proxy *p)
 			ha_alert("failed to allocate lb_nodes for server %s.\n", srv->id);
 			return -1;
 		}
+		srv->lb_server_key = chash_compute_server_key(srv);
 		for (node = 0; node < srv->lb_nodes_tot; node++) {
 			srv->lb_nodes[node].server = srv;
-			srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE + node);
+			srv->lb_nodes[node].node.key = chash_compute_node_key(srv, node);
 		}
 
 		if (srv_currently_usable(srv))
diff --git a/src/server.c b/src/server.c
index df24612f5..f5ffeb830 100644
--- a/src/server.c
+++ b/src/server.c
@@ -184,6 +184,9 @@ static void _srv_set_inetaddr_port(struct server *srv,
 	else
 		srv->flags &= ~SRV_F_MAPPORTS;
 
+	/* balancers (chash in particular) may use the addr in their routing decisions */
+	srv->proxy->lbprm.update_server_eweight(srv);
+
 	if (srv->log_target && srv->log_target->type == LOG_TARGET_DGRAM) {
 		/* server is used as a log target, manually update log target addr for DGRAM */
 		ipcpy(addr, srv->log_target->addr);
@@ -948,6 +951,32 @@ static int srv_parse_ws(char **args, int *cur_arg,
 	return 0;
 }
 
+/* Parse the "hash-key" server keyword */
+static int srv_parse_hash_key(char **args, int *cur_arg,
+			      struct proxy *curproxy, struct server *newsrv, char **err)
+{
+	if (!args[*cur_arg + 1]) {
+		memprintf(err, "'%s expects 'id', 'addr', or 'addr-port' value", args[*cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	if (strcmp(args[*cur_arg + 1], "id") == 0) {
+		newsrv->hash_key = SRV_HASH_KEY_ID;
+	}
+	else if (strcmp(args[*cur_arg + 1], "addr") == 0) {
+		newsrv->hash_key = SRV_HASH_KEY_ADDR;
+	}
+	else if (strcmp(args[*cur_arg + 1], "addr-port") == 0) {
+		newsrv->hash_key = SRV_HASH_KEY_ADDR_PORT;
+	}
+	else {
+		memprintf(err, "'%s' has to be 'id', 'addr', or 'addr-port'", args[*cur_arg]);
+		return ERR_ALERT | ERR_FATAL;
+	}
+
+	return 0;
+}
+
 /* Parse the "init-addr" server keyword */
 static int srv_parse_init_addr(char **args, int *cur_arg,
                                struct proxy *curproxy, struct server *newsrv, char **err)
@@ -2221,6 +2250,7 @@ static struct srv_kw_list srv_kws = { "ALL", { }, {
 	{ "enabled",              srv_parse_enabled,              0,  1,  1 }, /* Start the server in 'enabled' state */
 	{ "error-limit",          srv_parse_error_limit,          1,  1,  1 }, /* Configure the consecutive count of check failures to consider a server on error */
 	{ "ws",                   srv_parse_ws,                   1,  1,  1 }, /* websocket protocol */
+	{ "hash-key",             srv_parse_hash_key,             1,  1,  1 }, /* Configure how chash keys are computed */
 	{ "id",                   srv_parse_id,                   1,  0,  1 }, /* set id# of server */
 	{ "init-addr",            srv_parse_init_addr,            1,  1,  0 }, /* */
 	{ "log-bufsize",          srv_parse_log_bufsize,          1,  1,  0 }, /* Set the ring bufsize for log server (only for log backends) */
@@ -2664,6 +2694,7 @@ void srv_settings_cpy(struct server *srv, const struct server *src, int srv_tmpl
 	srv->minconn                  = src->minconn;
 	srv->maxconn                  = src->maxconn;
 	srv->slowstart                = src->slowstart;
+	srv->hash_key                 = src->hash_key;
 	srv->observe                  = src->observe;
 	srv->onerror                  = src->onerror;
 	srv->onmarkeddown             = src->onmarkeddown;
-- 
2.44.0

Reply via email to