Hello.

Attached you can find the patch for bringing up support for consistent hashing over 2 servers instead of a single one.


Thanks,

Davide.

>From 0e801bc82906785b829fb7e5133031ce56be39b8 Mon Sep 17 00:00:00 2001
From: Andrea Fagiani <andfagi...@gmail.com>
Date: Mon, 22 Nov 2021 13:15:18 +0100
Subject: [PATCH] hash: add support for consistent (hashing) 2x algorithm

This algorithm, unlike the traditional and already supported one,
first identifies the closest server to the request's hash (which
is called the primary) and, then, also the next one (the secondary).

Next, they get evaluated as below:
* if both servers are usable, the algorithm picks one up randomly,
  taking their weight into account;
* if only one of them is usable, then it's the one to be chosen;
* if both are unusable, then the algorithm keeps looking for
  another one.

If a server goes down, it stays in the tree, it does not get removed
from the cluster. In such case, the algorithm will take care to guide
the decision toward its primary/secondary server, depending on the
server status.

The algorithm is deterministic on the primary/secondary servers couple,
whilst nondeterministic over which of the two will be picked up to
serve the request. This leads to a balance among the designated servers
which will serve the request.
---
 include/haproxy/backend-t.h |   3 +-
 include/haproxy/lb_chash.h  |   1 +
 src/cfgparse.c              |   5 ++
 src/lb_chash.c              | 128 ++++++++++++++++++++++++++++++++++--
 4 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/include/haproxy/backend-t.h b/include/haproxy/backend-t.h
index 126528400..73cddebe4 100644
--- a/include/haproxy/backend-t.h
+++ b/include/haproxy/backend-t.h
@@ -109,7 +109,8 @@
 /* hash types */
 #define BE_LB_HASH_MAP    0x000000 /* map-based hash (default) */
 #define BE_LB_HASH_CONS   0x100000 /* consistent hashbit to indicate a dynamic algorithm */
-#define BE_LB_HASH_TYPE   0x100000 /* get/clear hash types */
+#define BE_LB_HASH_CONS2X 0x200000 /* consistent hashbit to indicate a dynamic algorithm */
+#define BE_LB_HASH_TYPE   0x300000 /* get/clear hash types */
 
 /* additional modifier on top of the hash function (only avalanche right now) */
 #define BE_LB_HMOD_AVAL   0x200000  /* avalanche modifier */
diff --git a/include/haproxy/lb_chash.h b/include/haproxy/lb_chash.h
index 79504574b..0aa72793a 100644
--- a/include/haproxy/lb_chash.h
+++ b/include/haproxy/lb_chash.h
@@ -30,6 +30,7 @@ struct server;
 int chash_init_server_tree(struct proxy *p);
 struct server *chash_get_next_server(struct proxy *p, struct server *srvtoavoid);
 struct server *chash_get_server_hash(struct proxy *p, unsigned int hash, const struct server *avoid);
+struct server *chash_get_server_hash_c2x(struct proxy *p, unsigned int hash);
 
 #endif /* _HAPROXY_LB_CHASH_H */
 
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 58067c8ed..3b5d4f956 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -3474,6 +3474,11 @@ int check_config_validity()
 				if (chash_init_server_tree(curproxy) < 0) {
 					cfgerr++;
 				}
+			} else if ((curproxy->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS2X) {
+				curproxy->lbprm.algo |= BE_LB_LKUP_CHTREE | BE_LB_PROP_DYN;
+				if (chash_init_server_tree(curproxy) < 0) {
+					cfgerr++;
+				}
 			} else {
 				curproxy->lbprm.algo |= BE_LB_LKUP_MAP;
 				init_server_map(curproxy);
diff --git a/src/lb_chash.c b/src/lb_chash.c
index 023219c98..66744762f 100644
--- a/src/lb_chash.c
+++ b/src/lb_chash.c
@@ -24,6 +24,12 @@
 #include <haproxy/server-t.h>
 #include <haproxy/tools.h>
 
+// prime number, just because
+#define CHASH_NUM_NODES 11587
+// do not replicate to machines having the same base id
+#define CHASH_BASE_NUM 1000
+#define CHASH_SRV_BASE(srv) ((int)(srv->puid / CHASH_BASE_NUM))
+
 /* Return next tree node after <node> which must still be in the tree, or be
  * NULL. Lookup wraps around the end to the beginning. If the next node is the
  * same node, return NULL. This is designed to find a valid next node before
@@ -155,7 +161,9 @@ static void chash_set_server_status_down(struct server *srv)
 		p->srv_act--;
 	}
 
-	chash_dequeue_srv(srv);
+	if ((p->lbprm.algo & BE_LB_HASH_TYPE) != BE_LB_HASH_CONS2X) {
+		chash_dequeue_srv(srv);
+	}
 
 out_update_backend:
 	/* check/update tot_used, tot_weight */
@@ -217,7 +225,9 @@ static void chash_set_server_status_up(struct server *srv)
 	}
 
 	/* note that eweight cannot be 0 here */
-	chash_queue_dequeue_srv(srv);
+	if ((p->lbprm.algo & BE_LB_HASH_TYPE) != BE_LB_HASH_CONS2X) {
+		chash_queue_dequeue_srv(srv);
+	}
 
  out_update_backend:
 	/* check/update tot_used, tot_weight */
@@ -268,7 +278,9 @@ static void chash_update_server_weight(struct server *srv)
 	HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock);
 
 	/* only adjust the server's presence in the tree */
-	chash_queue_dequeue_srv(srv);
+	if ((p->lbprm.algo & BE_LB_HASH_TYPE) != BE_LB_HASH_CONS2X) {
+		chash_queue_dequeue_srv(srv);
+	}
 
 	if (srv->flags & SRV_F_BACKUP)
 		p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight;
@@ -309,6 +321,104 @@ int chash_server_is_eligible(struct server *s)
 	return s->served < slots;
 }
 
+/*
+ * This function returns the running server from the CHASH (2x) tree.
+ * Differently from the traditional chash_get_server_hash() function,
+ * this first picks two servers <primary> and <secondary>,
+ * which are at the closest distance from the value of <hash>.
+ * If both are usable, it randomly chooses between them, taking their
+ * weights into account, and returns it.
+ * If one of them is unusable, it returns the latter.
+ * If both are unusable, then it looks for the next usable server in the
+ * tree.
+ * Keep in mind that when a server goes down, it does not get excluded
+ * from the cluster, staying in the tree: the algorithm itself will
+ * guide its decision towards its primary/secondary.
+ */
+struct server *chash_get_server_hash_c2x(struct proxy *p, unsigned int hash)
+{
+	struct eb32_node *start, *cur;
+	struct server *srv = NULL, *primary = NULL, *secondary = NULL, *first_usable = NULL;
+	struct eb_root *root;
+
+	if (p->srv_act)
+		root = &p->lbprm.chash.act;
+	else if (p->lbprm.fbck)
+		return p->lbprm.fbck;
+	else if (p->srv_bck)
+		root = &p->lbprm.chash.bck;
+	else
+		return NULL;
+
+	/* find the node after and the node before */
+	cur = eb32_lookup_ge(root, hash);
+	if (!cur)
+		cur = eb32_first(root);
+	if (!cur)
+		return NULL; /* tree is empty */
+
+	start = cur;
+	primary = eb32_entry(cur, struct tree_occ, node)->server;
+	int base_id = CHASH_SRV_BASE(primary);
+
+	/* find the secondary replica */
+	cur = eb32_next(start);
+	if (!cur)
+		cur = eb32_first(root);
+
+	while (cur != start) {
+		srv = eb32_entry(cur, struct tree_occ, node)->server;
+		if (CHASH_SRV_BASE(srv) != base_id) {
+			secondary = srv;
+			break;
+		}
+
+		cur = eb32_next(cur);
+		if (!cur)
+			cur = eb32_first(root);
+	}
+
+	int primary_usable = srv_currently_usable (primary);
+	int secondary_usable = secondary ? srv_currently_usable (secondary) : 0;
+
+	if (primary_usable && secondary_usable) {
+		uint64_t rsum = primary->cur_eweight + secondary->cur_eweight;
+		/* copied from sample.c:smp_fetch_rand() */
+		uint64_t r = ((uint64_t)random() * rsum) / ((uint64_t)RAND_MAX+1);
+		return r < primary->cur_eweight ? primary : secondary;
+	} else if (primary_usable) {
+		return primary;
+	} else if (secondary_usable) {
+		return secondary;
+	} else {
+		/* find a usable server, possibly with different base, regardless of the weight */
+		cur = eb32_next(start);
+		if (!cur)
+			cur = eb32_first(root);
+
+		while (cur != start) {
+			srv = eb32_entry(cur, struct tree_occ, node)->server;
+			if (srv_currently_usable(srv)) {
+				if (CHASH_SRV_BASE(srv) != base_id)
+					return srv;
+				else if (!first_usable)
+					first_usable = srv;
+			}
+
+			cur = eb32_next(cur);
+			if (!cur)
+				cur = eb32_first(root);
+
+		}
+	}
+
+	/* 
+	 * at this point we couldn't find any replica with different base
+	 * just return the first usable server, if any
+	 */
+	return first_usable;
+}
+
 /*
  * This function returns the running server from the CHASH tree, which is at
  * the closest distance from the value of <hash>. Doing so ensures that even
@@ -326,6 +436,10 @@ struct server *chash_get_server_hash(struct proxy *p, unsigned int hash, const s
 	struct eb_root *root;
 	unsigned int dn, dp;
 	int loop;
+	if ((p->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS2X) {
+		nsrv = chash_get_server_hash_c2x(p, hash);
+		return nsrv;
+	}
 
 	HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
 
@@ -497,7 +611,11 @@ int chash_init_server_tree(struct proxy *p)
 	/* queue active and backup servers in two distinct groups */
 	for (srv = p->srv; srv; srv = srv->next) {
 		srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.chash.bck : &p->lbprm.chash.act;
-		srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE;
+		if ((p->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS2X) {
+			srv->lb_nodes_tot = CHASH_NUM_NODES;
+		} else {
+			srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE;
+		}
 		srv->lb_nodes_now = 0;
 		srv->lb_nodes = calloc(srv->lb_nodes_tot,
 				       sizeof(*srv->lb_nodes));
@@ -510,7 +628,7 @@ int chash_init_server_tree(struct proxy *p)
 			srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE + node);
 		}
 
-		if (srv_currently_usable(srv))
+		if (((p->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS2X) || srv_currently_usable(srv))
 			chash_queue_dequeue_srv(srv);
 	}
 	return 0;
-- 
2.30.2

Reply via email to