Hi.

On 2026-02-06 (Fri.) 01:02, Aleksandar Lazic wrote:
Hi.

Here is my first attempt to add the (Peak) EWMA load balancing algorithm to HAProxy.

The doc and reg-test patches were missing from the first post; they are now included in this mail.

Regards
Aleks
From 7db40d8b38865f0bb065aae09b7f3f12d36e582c Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 08:45:32 +0100
Subject: [PATCH 10/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 doc/configuration.txt | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/doc/configuration.txt b/doc/configuration.txt
index b207e2c445..5bc3b88c23 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -6336,6 +6336,28 @@ balance url_param <param> [check_post]
                   This algorithm is only usable for backends in LOG mode, for
                   others, please use "hash" instead.
 
+      peak-ewma   The server with the lowest estimated load receives the
+                  connection. The estimated load is computed as the product of
+                  the server's exponentially weighted moving average (EWMA)
+                  response time and the number of in-flight requests, divided
+                  by the server's weight:
+
+                    score = latency * (inflight + 1) / weight
+
+                  This algorithm combines latency awareness with connection
+                  counting, naturally preferring fast servers over slow ones.
+                  It is particularly well suited for environments where backend
+                  servers have heterogeneous performance characteristics or
+                  where response times vary significantly. When a server has
+                  no measured response time yet (cold start), the latency
+                  defaults to 1, causing the algorithm to degrade gracefully
+                  to the equivalent of weighted least connections. This
+                  algorithm is dynamic, which means that server weights may be
+                  adjusted on the fly for slow starts for instance. It will
+                  also consider the number of queued connections in addition to
+                  the established ones in order to minimize queuing. This
+                  algorithm is not usable in LOG mode.
+
       sticky      Tries to stick to the same server as much as possible. The
                   first server in the list of available servers receives all
                   the log messages. When the server goes DOWN, the next server
-- 
2.43.0
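
As a quick illustration of the scoring described above: the implementation
(patch 08 below) scales the score by SRV_EWGHT_MAX and saturates it to 32 bits
so it can be used directly as an ebtree key. Here is a minimal standalone
sketch of that computation; SRV_EWGHT_MAX is set to a placeholder value for
the example only, the real constant comes from HAProxy's headers:

#include <stdint.h>
#include <stdio.h>

#define SRV_EWGHT_MAX 25600U /* placeholder for this example only */

/* key = latency * (inflight + 1) * SRV_EWGHT_MAX / eweight, saturated to 32 bits */
static uint32_t pewma_key(uint32_t latency, uint32_t inflight, uint32_t eweight)
{
	unsigned long long raw;

	if (!inflight)
		return 0; /* an idle server always sorts first */
	if (!latency)
		latency = 1; /* cold start: degrade to weighted least connections */
	raw = (unsigned long long)latency * (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1);
	return (raw > 0xffffffffULL) ? 0xffffffffU : (uint32_t)raw;
}

int main(void)
{
	/* a fast but busy server still scores below a slow, nearly idle one */
	printf("fast, busy: %u\n", pewma_key(10, 8, 100));  /* 10ms EWMA, 8 in flight */
	printf("slow, idle: %u\n", pewma_key(200, 1, 100)); /* 200ms EWMA, 1 in flight */
	return 0;
}

With these inputs the first server scores 10 * 9 = 90 against 200 * 2 = 400
(before weight scaling), so it is preferred, which is exactly the behavior
the doc text describes.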

From 5ce148a3621d9e97d3118e582556c763dbc2667d Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:59:46 +0100
Subject: [PATCH 08/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 src/lb_pewma.c | 940 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 940 insertions(+)
 create mode 100644 src/lb_pewma.c

diff --git a/src/lb_pewma.c b/src/lb_pewma.c
new file mode 100644
index 0000000000..69ac79fbe9
--- /dev/null
+++ b/src/lb_pewma.c
@@ -0,0 +1,940 @@
+/*
+ * Peak EWMA load balancing algorithm.
+ *
+ * This algorithm selects the server with the lowest estimated load
+ * computed as: latency * (inflight + 1) / weight, where latency is
+ * the EWMA of total response time (t_time). It combines response time
+ * awareness with connection counting, preferring fast servers over slow
+ * ones. The tree structure and concurrency logic is derived from the
+ * Fast Weighted Least Connection (FWLC) algorithm.
+ *
+ * Copyright 2026 Aleksandar Lazic <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <import/eb32tree.h>
+#include <haproxy/api.h>
+#include <haproxy/backend.h>
+#include <haproxy/freq_ctr.h>
+#include <haproxy/queue.h>
+#include <haproxy/server-t.h>
+#include <haproxy/task.h>
+#include <haproxy/tools.h>
+
+/* We reuse the same tree element layout as FWLC. The struct fwlc_tree_elt
+ * type is forward-declared in server-t.h via pointer members. We provide
+ * the full definition here so that we can access its fields.
+ */
+struct fwlc_tree_elt {
+	struct mt_list srv_list[PEWMA_LISTS_NB];
+	struct mt_list free_list;
+	struct eb32_node lb_node;
+	unsigned int elements;
+};
+
+DECLARE_STATIC_TYPED_POOL(pool_head_pewma_elt, "pewma_tree_elt", struct fwlc_tree_elt);
+
+#define PEWMA_LBPRM_SEQ(lbprm)		((lbprm) & 0xffffffff)
+#define PEWMA_LBPRM_SMALLEST(lbprm)	((lbprm) >> 32)
+
+/*
+ * Atomically try to update the sequence number, and the smallest key for which there is at least one server.
+ * Returns 1 on success, and 0 on failure.
+ */
+static int pewma_set_seq_and_smallest(struct lbprm *lbprm, uint64_t current, unsigned int seq, unsigned int smallest)
+{
+	uint64_t dst_nb = seq | ((uint64_t)smallest << 32);
+	int ret;
+#if defined(HA_CAS_IS_8B)
+	ret = _HA_ATOMIC_CAS(&lbprm->lb_seq, &current, dst_nb);
+#elif defined(HA_HAVE_CAS_DW)
+	ret = _HA_ATOMIC_DWCAS(&lbprm->lb_seq, &current, &dst_nb);
+#else
+	__decl_thread(static HA_SPINLOCK_T seq_lock);
+
+	HA_SPIN_LOCK(OTHER_LOCK, &seq_lock);
+	if (lbprm->lb_seq == current) {
+		lbprm->lb_seq = dst_nb;
+		ret = 1;
+	} else
+		ret = 0;
+	HA_SPIN_UNLOCK(OTHER_LOCK, &seq_lock);
+#endif
+	return ret;
+}
+
+/* Remove a server from a tree. It must have previously been dequeued. This
+ * function is meant to be called when a server is going down or has its
+ * weight disabled.
+ *
+ * The server's lock and the lbprm's lock must be held.
+ */
+static inline void pewma_remove_from_tree(struct server *s)
+{
+	s->lb_tree = NULL;
+}
+
+/*
+ * Remove anything allocated by the proxy
+ */
+static void pewma_proxy_deinit(struct proxy *p)
+{
+	struct fwlc_tree_elt *tree_elt;
+
+	while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+		pool_free(pool_head_pewma_elt, tree_elt);
+	}
+}
+
+/*
+ * Remove anything allocated by the server
+ */
+static void pewma_server_deinit(struct server *s)
+{
+	if (s->free_elt) {
+		pool_free(pool_head_pewma_elt, s->free_elt);
+		s->free_elt = NULL;
+	}
+}
+
+/* simply removes a server from a tree.
+ *
+ * The lbprm's lock must be held.
+ */
+static inline void pewma_dequeue_srv(struct server *s)
+{
+	struct fwlc_tree_elt *tree_elt = s->tree_elt;
+	unsigned int elts;
+
+	MT_LIST_DELETE(&s->lb_mt_list);
+	if (tree_elt) {
+		elts = _HA_ATOMIC_FETCH_SUB(&tree_elt->elements, 1);
+		/* We are the last element, we can nuke the node */
+		if (elts == 1) {
+			if (PEWMA_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) == tree_elt->lb_node.key) {
+				/*
+				 * We were the smallest one, and now we're
+				 * gone, reset it
+				 */
+				/*
+				 * We're holding the lbprm lock so this should never fail,
+				 * as nobody should be around to modify it
+				 */
+				do {
+				} while (pewma_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, PEWMA_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, 0) == 0 && __ha_cpu_relax());
+
+			}
+			eb32_delete(&tree_elt->lb_node);
+		}
+	}
+	s->tree_elt = NULL;
+	if (s->free_elt) {
+		pool_free(pool_head_pewma_elt, s->free_elt);
+		s->free_elt = NULL;
+	}
+}
+
+/*
+ * Allocate a tree element, either from the free list, from an element provided, or
+ * from allocation.
+ * Must be called with the wrlock
+ */
+static struct fwlc_tree_elt *pewma_alloc_tree_elt(struct proxy *p, struct fwlc_tree_elt *allocated_elt)
+{
+	struct fwlc_tree_elt *tree_elt = NULL;
+	int i = 0;
+
+	if (p->lbprm.lb_free_list_nb >= PEWMA_MIN_FREE_ENTRIES) {
+		while ((tree_elt = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list)) != NULL) {
+			MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+			if (tree_elt->elements == 0) {
+				eb32_delete(&tree_elt->lb_node);
+				if (i == 0) {
+					struct fwlc_tree_elt *tmptree;
+
+					tmptree = MT_LIST_POP(&p->lbprm.lb_free_list, struct fwlc_tree_elt *, free_list);
+					/*
+					 * Check if the next element still contains servers, and if not,
+					 * just free it, to do some cleanup.
+					 */
+					if (tmptree && tmptree->elements == 0) {
+						eb32_delete(&tmptree->lb_node);
+						pool_free(pool_head_pewma_elt, tmptree);
+						p->lbprm.lb_free_list_nb--;
+					} else if (tmptree)
+						MT_LIST_APPEND(&p->lbprm.lb_free_list, &tmptree->free_list);
+				}
+				return tree_elt;
+			}
+			i++;
+			if (i > 3)
+				break;
+		}
+	}
+	if (!allocated_elt) {
+		tree_elt = pool_alloc(pool_head_pewma_elt);
+		if (!tree_elt)
+			return NULL;
+	} else
+		tree_elt = allocated_elt;
+
+	for (i = 0; i < PEWMA_LISTS_NB; i++) {
+		MT_LIST_INIT(&tree_elt->srv_list[i]);
+	}
+	MT_LIST_INIT(&tree_elt->free_list);
+	MT_LIST_APPEND(&p->lbprm.lb_free_list, &tree_elt->free_list);
+	p->lbprm.lb_free_list_nb++;
+	tree_elt->elements = 0;
+	return tree_elt;
+}
+
+/*
+ * Return the tree element for the provided key, allocate it first if needed.
+ * Must be called with the lbprm lock held.
+ */
+static struct fwlc_tree_elt *pewma_get_tree_elt(struct server *s, u32 key)
+{
+	struct eb32_node *node;
+	struct fwlc_tree_elt *tree_elt = NULL;
+
+	node = eb32_lookup(s->lb_tree, key);
+	if (node)
+		tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+	if (!tree_elt) {
+		/* No element available, we have to allocate one */
+		tree_elt = pewma_alloc_tree_elt(s->proxy, NULL);
+		if (!tree_elt)
+			return NULL;
+		tree_elt->lb_node.key = key;
+		eb32_insert(s->lb_tree, &tree_elt->lb_node);
+	}
+	return tree_elt;
+}
+
+/* Queue a server in its associated tree, assuming the <eweight> is >0.
+ * Servers are sorted by latency * (#conns+1) / weight. To ensure maximum
+ * accuracy, we use latency * (#conns+1) * SRV_EWGHT_MAX / eweight as the
+ * sorting key. The latency is the EWMA of total response time (t_time).
+ * When no latency data is available (cold start), latency defaults to 1,
+ * degrading to FWLC behavior.
+ *
+ * NOTE: Depending on the calling context, we use s->next_eweight or
+ *       s->cur_eweight. The next value is used when the server state is updated
+ *       (because the weight changed for instance). During this step, the server
+ *       state is not yet committed. The current value is used to reposition the
+ *       server in the tree. This happens when the server is used.
+ *
+ * The lbprm's lock must be held.
+ */
+static inline void pewma_queue_srv(struct server *s, unsigned int eweight)
+{
+	struct fwlc_tree_elt *tree_elt;
+	unsigned int inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+	unsigned int list_nb;
+	unsigned int latency;
+	u32 key;
+
+	if (inflight) {
+		unsigned long long raw;
+
+		latency = swrate_avg(_HA_ATOMIC_LOAD(&s->counters.t_time), TIME_STATS_SAMPLES);
+		if (!latency)
+			latency = 1;
+		raw = (unsigned long long)latency * (inflight + 1) * SRV_EWGHT_MAX / eweight;
+		key = (raw > (unsigned long long)UINT32_MAX) ? UINT32_MAX : (u32)raw;
+	} else {
+		key = 0;
+	}
+
+	tree_elt = pewma_get_tree_elt(s, key);
+	if (tree_elt == NULL) {
+		/*
+		 * We failed to allocate memory for the tree_elt, just stop
+		 * now and schedule the requeue tasklet which will take care
+		 * of the queueing later.
+		 * If the tasklet doesn't exist yet, then there is nothing to
+		 * do, as it will be eventually scheduled after being created.
+		 */
+		if (s->requeue_tasklet)
+			tasklet_wakeup(s->requeue_tasklet);
+		return;
+	}
+	list_nb = statistical_prng_range(PEWMA_LISTS_NB);
+	MT_LIST_APPEND(&tree_elt->srv_list[list_nb], &s->lb_mt_list);
+	s->tree_elt = tree_elt;
+	_HA_ATOMIC_INC(&tree_elt->elements);
+	if (PEWMA_LBPRM_SMALLEST(s->proxy->lbprm.lb_seq) > key) {
+		/*
+		 * We're holding the lbprm lock so this should never fail,
+		 * as nobody should be around to modify it
+		 */
+		do {
+		} while (pewma_set_seq_and_smallest(&s->proxy->lbprm, s->proxy->lbprm.lb_seq, PEWMA_LBPRM_SEQ(s->proxy->lbprm.lb_seq) + 1, key) == 0 && __ha_cpu_relax());
+	}
+}
+
+/*
+ * Loop across the different lists until we find an unlocked one, and lock it.
+ */
+static __inline struct mt_list pewma_lock_target_list(struct fwlc_tree_elt *tree_elt)
+{
+	struct mt_list list = {NULL, NULL};
+	int i;
+	int dst_list;
+
+	dst_list = statistical_prng_range(PEWMA_LISTS_NB);
+
+	while (list.next == NULL) {
+		for (i = 0; i < PEWMA_LISTS_NB; i++) {
+			list = mt_list_try_lock_prev(&tree_elt->srv_list[(dst_list + i) % PEWMA_LISTS_NB]);
+			if (list.next != NULL)
+				break;
+		}
+	}
+	return list;
+}
+
+/*
+ * Calculate the key to be used for a given server.
+ * Peak EWMA key = latency * (inflight + 1) * SRV_EWGHT_MAX / eweight
+ */
+static inline unsigned int pewma_get_key(struct server *s)
+{
+	unsigned int inflight;
+	unsigned int eweight;
+	unsigned int latency;
+	unsigned long long raw_key;
+
+	inflight = _HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength);
+	eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+
+	/* Use the existing EWMA of total response time (t_time).
+	 * swrate_avg() recovers the average from the sliding window sum.
+	 * If no data yet (cold start), use 1 to degrade to pure FWLC behavior.
+	 */
+	latency = swrate_avg(_HA_ATOMIC_LOAD(&s->counters.t_time), TIME_STATS_SAMPLES);
+	if (!latency)
+		latency = 1;
+
+	if (!inflight)
+		return 0;
+
+	/* Use 64-bit intermediate to prevent overflow, then saturate to u32 */
+	raw_key = (unsigned long long)latency * (inflight + 1) * SRV_EWGHT_MAX / (eweight ? eweight : 1);
+	return (raw_key > (unsigned long long)UINT32_MAX) ? UINT32_MAX : (unsigned int)raw_key;
+}
+
+/*
+ * Only one thread will try to update a server position at a given time,
+ * thanks to the lb_lock. However that means that by the time we are done
+ * with the update, a new one might be needed, so check for that and
+ * schedule the tasklet if needed, once we dropped the lock.
+ */
+static inline void pewma_check_srv_key(struct server *s, unsigned int expected)
+{
+	unsigned int key = pewma_get_key(s);
+
+	if (key != expected && s->requeue_tasklet)
+		tasklet_wakeup(s->requeue_tasklet);
+}
+
+/* Re-position the server in the Peak EWMA tree after it has been assigned one
+ * connection or after it has released one. Note that it is possible that
+ * the server has been moved out of the tree due to failed health-checks.
+ * The lbprm's lock will be used.
+ */
+static void pewma_srv_reposition(struct server *s)
+{
+	struct mt_list to_unlock;
+	struct fwlc_tree_elt *tree_elt = NULL, *allocated_elt = NULL;
+	struct eb32_node *node;
+	struct mt_list list;
+	uint64_t cur_seq = 0;
+	unsigned int eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+	unsigned int new_key;
+	unsigned int smallest;
+	int srv_lock;
+
+	HA_RWLOCK_RDLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+	new_key = pewma_get_key(s);
+	/* some calls will be made for no change (e.g connect_server() after
+	 * assign_server(). Let's check that first.
+	 */
+	if ((s->tree_elt && s->tree_elt->lb_node.node.leaf_p && eweight &&
+	    s->tree_elt->lb_node.key == new_key) || !s->lb_tree) {
+		HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+		return;
+	}
+
+	srv_lock = HA_ATOMIC_XCHG(&s->lb_lock, 1);
+	/* Somebody else is updating that server, give up */
+	if (srv_lock == 1) {
+		HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+		return;
+	}
+
+	node = eb32_lookup(s->lb_tree, new_key);
+	if (node)
+		tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+	/*
+	 * It is possible that s->tree_elt was changed since we checked.
+	 * As s->tree_elt is only changed while holding s->lb_lock,
+	 * check again now that we acquired it, and if we're using
+	 * the right element, do nothing.
+	 */
+	if (s->tree_elt && tree_elt == s->tree_elt) {
+		HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+		_HA_ATOMIC_STORE(&s->lb_lock, 0);
+		pewma_check_srv_key(s, new_key);
+		return;
+	}
+	/*
+	 * We have to allocate a new tree element, and/or remove the
+	 * previous element, we will modify the tree, so let's get the write
+	 * lock.
+	 */
+	if (!tree_elt) {
+		unsigned int new_new_key;
+
+		/*
+		 * We don't want to allocate something while holding the lock,
+		 * so make sure we have something allocated before.
+		 */
+		if (s->free_elt != NULL) {
+			allocated_elt = s->free_elt;
+			s->free_elt = NULL;
+		} else
+			allocated_elt = pool_alloc(pool_head_pewma_elt);
+		if (HA_RWLOCK_TRYRDTOWR(LBPRM_LOCK, &s->proxy->lbprm.lock) != 0) {
+			/* there's already some contention on the tree's lock, there's
+			 * no point insisting. Better wake up the server's tasklet that
+			 * will let this or another thread retry later. For the time
+			 * being, the server's apparent load is slightly inaccurate but
+			 * we don't care, if there is contention, it will self-regulate.
+			 */
+			if (s->requeue_tasklet)
+				tasklet_wakeup(s->requeue_tasklet);
+			HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+			s->free_elt = allocated_elt;
+			_HA_ATOMIC_STORE(&s->lb_lock, 0);
+			return;
+		}
+
+		/* we might have been waiting for a while on the lock above
+		 * so it's worth testing again because other threads are very
+		 * likely to have released a connection or taken one leading
+		 * to our target value (50% of the case in measurements).
+		 */
+
+		new_new_key = pewma_get_key(s);
+		if (new_new_key != new_key) {
+			if (s->tree_elt &&
+			    s->tree_elt->lb_node.node.leaf_p &&
+			    eweight && s->tree_elt->lb_node.key == new_new_key) {
+				/* Okay after all we have nothing to do */
+				HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+				s->free_elt = allocated_elt;
+				_HA_ATOMIC_STORE(&s->lb_lock, 0);
+				pewma_check_srv_key(s, new_new_key);
+				return;
+			}
+			node = eb32_lookup(s->lb_tree, new_new_key);
+			if (node) {
+				tree_elt = container_of(node, struct fwlc_tree_elt, lb_node);
+				HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+				s->free_elt = allocated_elt;
+				allocated_elt = NULL;
+			} else
+				tree_elt = NULL;
+			new_key = new_new_key;
+		}
+	}
+
+	/*
+	 * Now we increment the number of elements in the new tree_elt,
+	 * we change our sequence number and smallest, and we then
+	 * decrement the number of elements in the old tree_elt.
+	 * It is important to keep this sequencing, as pewma_get_next_server()
+	 * uses the number of elements to know if there is something to look for,
+	 * and we want to make sure we do not miss a server.
+	 */
+	if (!tree_elt) {
+		/*
+		 * There were no tree element matching our key,
+		 * allocate one and insert it into the tree
+		 */
+		tree_elt = pewma_alloc_tree_elt(s->proxy, allocated_elt);
+		if (tree_elt == NULL) {
+			/* We failed to allocate memory, just try again later */
+			HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+			_HA_ATOMIC_STORE(&s->lb_lock, 0);
+			if (s->requeue_tasklet)
+				tasklet_wakeup(s->requeue_tasklet);
+			return;
+		}
+		if (tree_elt == allocated_elt)
+			allocated_elt = NULL;
+		tree_elt->lb_node.key = new_key;
+		tree_elt->elements = 1;
+		__ha_barrier_store();
+		/* If we allocated, then we hold the write lock */
+		eb32_insert(s->lb_tree, &tree_elt->lb_node);
+		HA_RWLOCK_WRTORD(LBPRM_LOCK, &s->proxy->lbprm.lock);
+	} else {
+		_HA_ATOMIC_INC(&tree_elt->elements);
+	}
+
+	__ha_barrier_store();
+	/*
+	 * Update the sequence number, and the smallest if needed.
+	 * We always have to do it, even if we're not actually
+	 * updating the smallest one, otherwise we'll get an
+	 * ABA problem and a server may be missed when looked up.
+	 * The only time we don't have to do it if is another thread
+	 * increased it, and the new smallest element is not
+	 * higher than our new key.
+	 */
+	do {
+		unsigned int tmpsmallest;
+		uint64_t newcurseq = _HA_ATOMIC_LOAD(&s->proxy->lbprm.lb_seq);
+
+		if (cur_seq != 0 && PEWMA_LBPRM_SEQ(newcurseq) >
+		    PEWMA_LBPRM_SEQ(cur_seq) && new_key >= PEWMA_LBPRM_SMALLEST(newcurseq))
+			break;
+
+		cur_seq = newcurseq;
+		tmpsmallest = PEWMA_LBPRM_SMALLEST(cur_seq);
+		if (new_key > tmpsmallest)
+			smallest = tmpsmallest;
+		else
+			smallest = new_key;
+
+	} while (pewma_set_seq_and_smallest(&s->proxy->lbprm, cur_seq, PEWMA_LBPRM_SEQ(cur_seq) + 1, smallest) == 0 && __ha_cpu_relax());
+
+	__ha_barrier_store();
+
+	if (likely(s->tree_elt)) {
+		_HA_ATOMIC_DEC(&s->tree_elt->elements);
+
+		/*
+		 * Now lock the existing element, and its target list.
+		 * To prevent a deadlock, we always lock the one
+		 * with the lowest key first.
+		 */
+		if (new_key < s->tree_elt->lb_node.key) {
+			to_unlock = mt_list_lock_full(&s->lb_mt_list);
+			list = pewma_lock_target_list(tree_elt);
+		} else {
+			list = pewma_lock_target_list(tree_elt);
+			to_unlock = mt_list_lock_full(&s->lb_mt_list);
+		}
+
+		/*
+		 * Unlock the old list, the element is now
+		 * no longer in it.
+		 */
+		mt_list_unlock_link(to_unlock);
+	} else
+		list = pewma_lock_target_list(tree_elt);
+
+	/*
+	 * Add the element to the new list, and unlock it.
+	 */
+	mt_list_unlock_full(&s->lb_mt_list, list);
+
+	s->tree_elt = tree_elt;
+
+	HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &s->proxy->lbprm.lock);
+
+	if (allocated_elt)
+		s->free_elt = allocated_elt;
+
+	__ha_barrier_store();
+	_HA_ATOMIC_STORE(&s->lb_lock, 0);
+
+	pewma_check_srv_key(s, new_key);
+}
+
+/* This function updates the server trees according to server <srv>'s new
+ * state. It should be called when server <srv>'s status changes to down.
+ * It is not important whether the server was already down or not. It is not
+ * important either that the new state is completely down (the caller may not
+ * know all the variables of a server's state).
+ *
+ * The server's lock must be held. The lbprm's lock will be used.
+ */
+static void pewma_set_server_status_down(struct server *srv)
+{
+	struct proxy *p = srv->proxy;
+
+	if (!srv_lb_status_changed(srv))
+		return;
+
+	if (srv_willbe_usable(srv))
+		goto out_update_state;
+	HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
+
+	if (!srv_currently_usable(srv))
+		/* server was already down */
+		goto out_update_backend;
+
+	if (srv->flags & SRV_F_BACKUP) {
+		p->lbprm.tot_wbck -= srv->cur_eweight;
+		p->srv_bck--;
+
+		if (srv == p->lbprm.fbck) {
+			/* we lost the first backup server in a single-backup
+			 * configuration, we must search another one.
+			 */
+			struct server *srv2 = p->lbprm.fbck;
+			do {
+				srv2 = srv2->next;
+			} while (srv2 &&
+				 !((srv2->flags & SRV_F_BACKUP) &&
+				   srv_willbe_usable(srv2)));
+			p->lbprm.fbck = srv2;
+		}
+	} else {
+		p->lbprm.tot_wact -= srv->cur_eweight;
+		p->srv_act--;
+	}
+
+	pewma_dequeue_srv(srv);
+	pewma_remove_from_tree(srv);
+
+out_update_backend:
+	/* check/update tot_used, tot_weight */
+	update_backend_weight(p);
+	HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
+ out_update_state:
+	srv_lb_commit_status(srv);
+}
+
+/* This function updates the server trees according to server <srv>'s new
+ * state. It should be called when server <srv>'s status changes to up.
+ * It is not important whether the server was already down or not. It is not
+ * important either that the new state is completely UP (the caller may not
+ * know all the variables of a server's state). This function will not change
+ * the weight of a server which was already up.
+ *
+ * The server's lock must be held. The lbprm's lock will be used.
+ */
+static void pewma_set_server_status_up(struct server *srv)
+{
+	struct proxy *p = srv->proxy;
+
+	if (!srv_lb_status_changed(srv))
+		return;
+
+	if (!srv_willbe_usable(srv))
+		goto out_update_state;
+
+	HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
+	if (srv_currently_usable(srv))
+		/* server was already up */
+		goto out_update_backend;
+
+	if (srv->flags & SRV_F_BACKUP) {
+		srv->lb_tree = &p->lbprm.pewma.bck;
+		p->lbprm.tot_wbck += srv->next_eweight;
+		p->srv_bck++;
+
+		if (!(p->options & PR_O_USE_ALL_BK)) {
+			if (!p->lbprm.fbck) {
+				/* there was no backup server anymore */
+				p->lbprm.fbck = srv;
+			} else {
+				/* we may have restored a backup server prior to fbck,
+				 * in which case it should replace it.
+				 */
+				struct server *srv2 = srv;
+				do {
+					srv2 = srv2->next;
+				} while (srv2 && (srv2 != p->lbprm.fbck));
+				if (srv2)
+					p->lbprm.fbck = srv;
+			}
+		}
+	} else {
+		srv->lb_tree = &p->lbprm.pewma.act;
+		p->lbprm.tot_wact += srv->next_eweight;
+		p->srv_act++;
+	}
+
+	/* note that eweight cannot be 0 here */
+	pewma_queue_srv(srv, srv->next_eweight);
+
+ out_update_backend:
+	/* check/update tot_used, tot_weight */
+	update_backend_weight(p);
+	HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
+ out_update_state:
+	srv_lb_commit_status(srv);
+}
+
+/* This function must be called after an update to server <srv>'s effective
+ * weight. It may be called after a state change too.
+ *
+ * The server's lock must be held. The lbprm's lock will be used.
+ */
+static void pewma_update_server_weight(struct server *srv)
+{
+	int old_state, new_state;
+	struct proxy *p = srv->proxy;
+
+	if (!srv_lb_status_changed(srv))
+		return;
+
+	/* If changing the server's weight changes its state, we simply apply
+	 * the procedures we already have for status change. If the state
+	 * remains down, the server is not in any tree, so it's as easy as
+	 * updating its values. If the state remains up with different weights,
+	 * there are some computations to perform to find a new place and
+	 * possibly a new tree for this server.
+	 */
+
+	old_state = srv_currently_usable(srv);
+	new_state = srv_willbe_usable(srv);
+
+	if (!old_state && !new_state) {
+		srv_lb_commit_status(srv);
+		return;
+	}
+	else if (!old_state && new_state) {
+		pewma_set_server_status_up(srv);
+		return;
+	}
+	else if (old_state && !new_state) {
+		pewma_set_server_status_down(srv);
+		return;
+	}
+
+	HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
+	if (srv->lb_tree)
+		pewma_dequeue_srv(srv);
+
+	if (srv->flags & SRV_F_BACKUP) {
+		p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight;
+		srv->lb_tree = &p->lbprm.pewma.bck;
+	} else {
+		p->lbprm.tot_wact += srv->next_eweight - srv->cur_eweight;
+		srv->lb_tree = &p->lbprm.pewma.act;
+	}
+
+	pewma_queue_srv(srv, srv->next_eweight);
+
+	update_backend_weight(p);
+	HA_RWLOCK_WRUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
+	srv_lb_commit_status(srv);
+}
+
+/* This function is responsible for building the trees in case of Peak EWMA.
+ * It also sets p->lbprm.wdiv to the eweight to uweight ratio. Both active
+ * and backup groups are initialized.
+ */
+void pewma_init_server_tree(struct proxy *p)
+{
+	struct server *srv;
+	struct eb_root init_head = EB_ROOT;
+
+	p->lbprm.set_server_status_up   = pewma_set_server_status_up;
+	p->lbprm.set_server_status_down = pewma_set_server_status_down;
+	p->lbprm.update_server_eweight  = pewma_update_server_weight;
+	p->lbprm.server_take_conn = pewma_srv_reposition;
+	p->lbprm.server_drop_conn = pewma_srv_reposition;
+	p->lbprm.server_requeue   = pewma_srv_reposition;
+	p->lbprm.server_deinit    = pewma_server_deinit;
+	p->lbprm.proxy_deinit     = pewma_proxy_deinit;
+
+	p->lbprm.wdiv = BE_WEIGHT_SCALE;
+	for (srv = p->srv; srv; srv = srv->next) {
+		srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult;
+		srv_lb_commit_status(srv);
+	}
+
+	p->lbprm.lb_seq = 0;
+
+	recount_servers(p);
+	update_backend_weight(p);
+
+	p->lbprm.pewma.act = init_head;
+	p->lbprm.pewma.bck = init_head;
+
+	/* queue active and backup servers in two distinct groups */
+	for (srv = p->srv; srv; srv = srv->next) {
+		if (!srv_currently_usable(srv))
+			continue;
+		srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.pewma.bck : &p->lbprm.pewma.act;
+		pewma_queue_srv(srv, srv->next_eweight);
+	}
+}
+
+/* Return next server from the Peak EWMA tree in backend <p>. If the tree is
+ * empty, return NULL. Saturated servers are skipped.
+ *
+ * The lbprm's lock will be used in R/O mode. The server's lock is not used.
+ */
+struct server *pewma_get_next_server(struct proxy *p, struct server *srvtoavoid)
+{
+	struct server *srv, *avoided;
+	struct eb32_node *node;
+	uint64_t curseq;
+	int found = 0;
+
+	srv = avoided = NULL;
+
+	HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+	curseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+ redo:
+	if (p->srv_act)
+		node = eb32_lookup_ge(&p->lbprm.pewma.act, PEWMA_LBPRM_SMALLEST(curseq));
+	else if (p->lbprm.fbck) {
+		srv = p->lbprm.fbck;
+		goto out;
+	}
+	else if (p->srv_bck)
+		node = eb32_lookup_ge(&p->lbprm.pewma.bck, PEWMA_LBPRM_SMALLEST(curseq));
+	else {
+		srv = NULL;
+		goto out;
+	}
+
+	while (node) {
+		struct fwlc_tree_elt *tree_elt;
+		struct server *s;
+		int unusable = 0;
+		int orig_nb;
+		int i = 0;
+
+		tree_elt = eb32_entry(node, struct fwlc_tree_elt, lb_node);
+		orig_nb = statistical_prng_range(PEWMA_LISTS_NB);
+
+		while (_HA_ATOMIC_LOAD(&tree_elt->elements) > unusable) {
+			struct mt_list mt_list;
+			mt_list.next = _HA_ATOMIC_LOAD(&tree_elt->srv_list[(i + orig_nb) % PEWMA_LISTS_NB].next);
+
+			if (mt_list.next != &tree_elt->srv_list[(i + orig_nb) % PEWMA_LISTS_NB] && mt_list.next != MT_LIST_BUSY) {
+				unsigned int eweight;
+				unsigned int planned_inflight;
+				unsigned int lat;
+				s = container_of(mt_list.next, struct server, lb_mt_list);
+				eweight = _HA_ATOMIC_LOAD(&s->cur_eweight);
+
+				/* Reverse the key computation to get planned inflight.
+				 * key = latency * (inflight + 1) * SRV_EWGHT_MAX / eweight
+				 * => inflight + 1 = key * eweight / (SRV_EWGHT_MAX * latency)
+				 */
+				lat = swrate_avg(_HA_ATOMIC_LOAD(&s->counters.t_time), TIME_STATS_SAMPLES);
+				if (!lat)
+					lat = 1;
+				planned_inflight = (unsigned long long)tree_elt->lb_node.key * eweight / ((unsigned long long)SRV_EWGHT_MAX * lat);
+
+				if (!s->maxconn || s->served + s->queueslength < srv_dynamic_maxconn(s) + s->maxqueue) {
+					if (_HA_ATOMIC_LOAD(&s->served) + _HA_ATOMIC_LOAD(&s->queueslength) > planned_inflight + 2) {
+						/*
+						 * The server has more requests than expected,
+						 * let's try to reposition it, to avoid too
+						 * many threads using the same server at the
+						 * same time. From the moment we release the
+						 * lock, we cannot trust the node nor tree_elt
+						 * anymore, so we need to loop back to the
+						 * beginning.
+						 */
+						if (i >= PEWMA_LISTS_NB) {
+							HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+							pewma_srv_reposition(s);
+							HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
+							goto redo;
+						}
+						i++;
+						continue;
+					}
+					if (s != srvtoavoid) {
+						srv = s;
+						found = 1;
+						break;
+					}
+					avoided = s;
+				}
+				unusable++;
+				i++;
+			} else if (mt_list.next == &tree_elt->srv_list[(i + orig_nb) % PEWMA_LISTS_NB]) {
+				/* the list is empty, try the next one */
+				i++;
+				continue;
+			} else {
+				/* the list is locked by another thread, try the next one */
+				i++;
+				continue;
+			}
+		}
+		if (found)
+			break;
+
+		do {
+			node = eb32_next(node);
+		} while (node && node->key < PEWMA_LBPRM_SMALLEST(curseq));
+
+		if (node) {
+			uint64_t newcurseq = HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+			/*
+			 * If we have a bigger element than the smallest recorded, and we're up to date,
+			 * update the smallest one.
+			 */
+			if (likely(newcurseq == curseq && PEWMA_LBPRM_SMALLEST(newcurseq) < node->key)) {
+				if (pewma_set_seq_and_smallest(&p->lbprm, curseq, PEWMA_LBPRM_SEQ(curseq), node->key) != 0) {
+					curseq = PEWMA_LBPRM_SEQ(curseq) | ((uint64_t)node->key << 32);
+					__ha_barrier_store();
+					continue;
+				}
+
+			}
+			/*
+			 * Somebody added a new server in node we already skipped, so retry from the beginning.
+			 */
+			if (unlikely(PEWMA_LBPRM_SMALLEST(newcurseq) < node->key && PEWMA_LBPRM_SEQ(newcurseq) != PEWMA_LBPRM_SEQ(curseq))) {
+				curseq = newcurseq;
+				goto redo;
+			}
+			curseq = newcurseq;
+		} else {
+			uint64_t newcurseq = _HA_ATOMIC_LOAD(&p->lbprm.lb_seq);
+
+			/*
+			 * No more node, but somebody changed the tree, so it's
+			 * worth trying again.
+			 */
+			if (PEWMA_LBPRM_SEQ(newcurseq) != PEWMA_LBPRM_SEQ(curseq)) {
+				curseq = newcurseq;
+				goto redo;
+			}
+		}
+	}
+
+	if (!srv)
+		srv = avoided;
+ out:
+	HA_RWLOCK_RDUNLOCK(LBPRM_LOCK, &p->lbprm.lock);
+
+	return srv;
+}
+
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
-- 
2.43.0
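
A note on the concurrency scheme used throughout this file: lbprm->lb_seq is a
single 64-bit word that packs a generation counter in its low 32 bits and the
smallest populated tree key in its high 32 bits, so both can be updated with
one CAS (or the DWCAS/spinlock fallback in pewma_set_seq_and_smallest()
above). A tiny sketch of the packing, mirroring the PEWMA_LBPRM_SEQ and
PEWMA_LBPRM_SMALLEST macros:

#include <stdint.h>

#define PEWMA_LBPRM_SEQ(v)      ((v) & 0xffffffff)
#define PEWMA_LBPRM_SMALLEST(v) ((v) >> 32)

/* combine a sequence number and a smallest key into one CAS-able word */
uint64_t pewma_pack(uint32_t seq, uint32_t smallest)
{
	return (uint64_t)seq | ((uint64_t)smallest << 32);
}

Writers bump the sequence even when the smallest key itself is unchanged; as
the comment in pewma_srv_reposition() explains, this is what lets readers in
pewma_get_next_server() detect concurrent tree modifications and avoid the
ABA problem.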

From 84e9c1e88bf21df2ca1ef9816ba110677a1ab0ba Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:59:31 +0100
Subject: [PATCH 07/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 include/haproxy/lb_pewma.h | 40 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 include/haproxy/lb_pewma.h

diff --git a/include/haproxy/lb_pewma.h b/include/haproxy/lb_pewma.h
new file mode 100644
index 0000000000..aad9814c7f
--- /dev/null
+++ b/include/haproxy/lb_pewma.h
@@ -0,0 +1,40 @@
+/*
+ * include/haproxy/lb_pewma.h
+ * Peak EWMA load balancing algorithm.
+ *
+ * Copyright 2026 Aleksandar Lazic <[email protected]>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef _HAPROXY_LB_PEWMA_H
+#define _HAPROXY_LB_PEWMA_H
+
+#include <haproxy/api.h>
+#include <haproxy/lb_pewma-t.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/server-t.h>
+
+void pewma_init_server_tree(struct proxy *p);
+struct server *pewma_get_next_server(struct proxy *p, struct server *srvtoavoid);
+
+#endif /* _HAPROXY_LB_PEWMA_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
-- 
2.43.0

From bd524ccac26ae35567f0a62a145f54ef58ec6936 Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:59:16 +0100
Subject: [PATCH 06/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 include/haproxy/lb_pewma-t.h | 39 ++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 include/haproxy/lb_pewma-t.h

diff --git a/include/haproxy/lb_pewma-t.h b/include/haproxy/lb_pewma-t.h
new file mode 100644
index 0000000000..9bef349521
--- /dev/null
+++ b/include/haproxy/lb_pewma-t.h
@@ -0,0 +1,39 @@
+/*
+ * include/haproxy/lb_pewma-t.h
+ * Types for Peak EWMA load balancing algorithm.
+ *
+ * Copyright 2026 Aleksandar Lazic <[email protected]>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef _HAPROXY_LB_PEWMA_T_H
+#define _HAPROXY_LB_PEWMA_T_H
+
+#include <import/ebtree-t.h>
+
+struct lb_pewma {
+	struct eb_root act;	/* peak ewma tree on the active servers */
+	struct eb_root bck;	/* peak ewma tree on the backup servers */
+};
+
+#endif /* _HAPROXY_LB_PEWMA_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
-- 
2.43.0

From 0d2f9ebd82d41fbea495e9d858cff4038d198548 Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:58:58 +0100
Subject: [PATCH 05/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 src/cfgparse.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/cfgparse.c b/src/cfgparse.c
index 52a4cb8fe0..3a6846c531 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -66,6 +66,7 @@
 #include <haproxy/lb_chash.h>
 #include <haproxy/lb_fas.h>
 #include <haproxy/lb_fwlc.h>
+#include <haproxy/lb_pewma.h>
 #include <haproxy/lb_fwrr.h>
 #include <haproxy/lb_map.h>
 #include <haproxy/lb_ss.h>
@@ -3413,6 +3414,9 @@ int check_config_validity()
 			if ((curproxy->lbprm.algo & BE_LB_PARM) == BE_LB_CB_LC) {
 				curproxy->lbprm.algo |= BE_LB_LKUP_LCTREE | BE_LB_PROP_DYN;
 				fwlc_init_server_tree(curproxy);
+			} else if ((curproxy->lbprm.algo & BE_LB_PARM) == BE_LB_CB_PE) {
+				curproxy->lbprm.algo |= BE_LB_LKUP_PETREE | BE_LB_PROP_DYN;
+				pewma_init_server_tree(curproxy);
 			} else {
 				curproxy->lbprm.algo |= BE_LB_LKUP_FSTREE | BE_LB_PROP_DYN;
 				fas_init_server_tree(curproxy);
-- 
2.43.0

From 816700e686bb173886e6ef4de7105e945bdf3e04 Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:58:43 +0100
Subject: [PATCH 04/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 src/backend.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/backend.c b/src/backend.c
index 73b39306b9..47ea1c1b1e 100644
--- a/src/backend.c
+++ b/src/backend.c
@@ -38,6 +38,7 @@
 #include <haproxy/lb_chash.h>
 #include <haproxy/lb_fas.h>
 #include <haproxy/lb_fwlc.h>
+#include <haproxy/lb_pewma.h>
 #include <haproxy/lb_fwrr.h>
 #include <haproxy/lb_map.h>
 #include <haproxy/lb_ss.h>
@@ -714,6 +715,10 @@ int assign_server(struct stream *s)
 			srv = fwlc_get_next_server(s->be, prev_srv);
 			break;
 
+		case BE_LB_LKUP_PETREE:
+			srv = pewma_get_next_server(s->be, prev_srv);
+			break;
+
 		case BE_LB_LKUP_CHTREE:
 		case BE_LB_LKUP_MAP:
 			if ((s->be->lbprm.algo & BE_LB_KIND) == BE_LB_KIND_RR) {
@@ -3070,6 +3075,8 @@ const char *backend_lb_algo_str(int algo) {
 		return "first";
 	else if (algo == BE_LB_ALGO_LC)
 		return "leastconn";
+	else if (algo == BE_LB_ALGO_PE)
+		return "peak-ewma";
 	else if (algo == BE_LB_ALGO_SH)
 		return "source";
 	else if (algo == BE_LB_ALGO_UH)
@@ -3120,6 +3127,10 @@ int backend_parse_balance(const char **args, char **err, struct proxy *curproxy)
 		curproxy->lbprm.algo &= ~BE_LB_ALGO;
 		curproxy->lbprm.algo |= BE_LB_ALGO_LC;
 	}
+	else if (strcmp(args[0], "peak-ewma") == 0) {
+		curproxy->lbprm.algo &= ~BE_LB_ALGO;
+		curproxy->lbprm.algo |= BE_LB_ALGO_PE;
+	}
 	else if (!strncmp(args[0], "random", 6)) {
 		curproxy->lbprm.algo &= ~BE_LB_ALGO;
 		curproxy->lbprm.algo |= BE_LB_ALGO_RND;
@@ -3300,7 +3311,7 @@ int backend_parse_balance(const char **args, char **err, struct proxy *curproxy)
 		curproxy->lbprm.algo |= BE_LB_ALGO_SS;
 	}
 	else {
-		memprintf(err, "only supports 'roundrobin', 'static-rr', 'leastconn', 'source', 'uri', 'url_param', 'hash', 'hdr(name)', 'rdp-cookie(name)', 'log-hash' and 'sticky' options.");
+		memprintf(err, "only supports 'roundrobin', 'static-rr', 'leastconn', 'peak-ewma', 'source', 'uri', 'url_param', 'hash', 'hdr(name)', 'rdp-cookie(name)', 'log-hash' and 'sticky' options.");
 		return -1;
 	}
 	return 0;
-- 
2.43.0
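
With the parser change above, the keyword is used like any other balance
algorithm. A minimal hypothetical backend section (names and addresses are
invented for the example):

    backend app
        balance peak-ewma
        server s1 192.0.2.10:8080 weight 10
        server s2 192.0.2.11:8080 weight 20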

From ef07ec76a4eccae9e904234778e9fb8b22ebc0d8 Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:58:24 +0100
Subject: [PATCH 03/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 include/haproxy/defaults.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index 04eae5df06..de855b50e7 100644
--- a/include/haproxy/defaults.h
+++ b/include/haproxy/defaults.h
@@ -661,6 +661,18 @@
 #define FWLC_MIN_FREE_ENTRIES 500
 #endif /* FWLC_MIN_FREE_ENTRIES */
 
+/*
+ * Peak EWMA tree constants. Same structure as FWLC but separated
+ * for independent tuning.
+ */
+#ifndef PEWMA_LISTS_NB
+#define PEWMA_LISTS_NB   4
+#endif /* PEWMA_LISTS_NB */
+
+#ifndef PEWMA_MIN_FREE_ENTRIES
+#define PEWMA_MIN_FREE_ENTRIES 500
+#endif /* PEWMA_MIN_FREE_ENTRIES */
+
 /*
  * QUIC
  */
-- 
2.43.0
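
Since both constants are guarded by #ifndef, they can be overridden at build
time without patching the source. Assuming the usual DEFINE hook in HAProxy's
Makefile, something like:

    make TARGET=linux-glibc DEFINE="-DPEWMA_LISTS_NB=8 -DPEWMA_MIN_FREE_ENTRIES=1000"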

From ab0fa769d7ba17d97adfc168a5a4abbc3f952a2e Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:58:00 +0100
Subject: [PATCH 02/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 include/haproxy/backend-t.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/haproxy/backend-t.h b/include/haproxy/backend-t.h
index 35f882f28a..9bea12e48c 100644
--- a/include/haproxy/backend-t.h
+++ b/include/haproxy/backend-t.h
@@ -26,6 +26,7 @@
 #include <haproxy/lb_chash-t.h>
 #include <haproxy/lb_fas-t.h>
 #include <haproxy/lb_fwlc-t.h>
+#include <haproxy/lb_pewma-t.h>
 #include <haproxy/lb_fwrr-t.h>
 #include <haproxy/lb_map-t.h>
 #include <haproxy/lb_ss-t.h>
@@ -58,6 +59,7 @@
 /* BE_LB_CB_* is used with BE_LB_KIND_CB */
 #define BE_LB_CB_LC     0x00000000  /* least-connections */
 #define BE_LB_CB_FAS    0x00000001  /* first available server (opposite of leastconn) */
+#define BE_LB_CB_PE     0x00000002  /* peak ewma */
 
 /* BE_LB_SA_* is used with BE_LB_KIND_SA */
 #define BE_LB_SA_SS     0x00000000  /* stick to server as long as it is available */
@@ -88,6 +90,7 @@
 #define BE_LB_ALGO_RND  (BE_LB_KIND_RR | BE_LB_NEED_NONE | BE_LB_RR_RANDOM) /* random value */
 #define BE_LB_ALGO_LC   (BE_LB_KIND_CB | BE_LB_NEED_NONE | BE_LB_CB_LC)    /* least connections */
 #define BE_LB_ALGO_FAS  (BE_LB_KIND_CB | BE_LB_NEED_NONE | BE_LB_CB_FAS)   /* first available server */
+#define BE_LB_ALGO_PE   (BE_LB_KIND_CB | BE_LB_NEED_NONE | BE_LB_CB_PE)    /* peak ewma */
 #define BE_LB_ALGO_SS   (BE_LB_KIND_SA | BE_LB_NEED_NONE | BE_LB_SA_SS)    /* sticky */
 #define BE_LB_ALGO_SRR  (BE_LB_KIND_RR | BE_LB_NEED_NONE | BE_LB_RR_STATIC) /* static round robin */
 #define BE_LB_ALGO_SH	(BE_LB_KIND_HI | BE_LB_NEED_ADDR | BE_LB_HASH_SRC) /* hash: source IP */
@@ -109,6 +112,7 @@
 #define BE_LB_LKUP_LCTREE 0x00300000  /* FWLC tree lookup */
 #define BE_LB_LKUP_CHTREE 0x00400000  /* consistent hash  */
 #define BE_LB_LKUP_FSTREE 0x00500000  /* FAS tree lookup */
+#define BE_LB_LKUP_PETREE 0x00600000  /* Peak EWMA tree lookup */
 #define BE_LB_LKUP        0x00700000  /* mask to get just the LKUP value */
 
 /* additional properties */
@@ -158,6 +162,7 @@ struct lbprm {
 		struct lb_fwlc fwlc;
 		struct lb_chash chash;
 		struct lb_fas fas;
+		struct lb_pewma pewma;
 		struct lb_ss ss;
 	};
 	uint32_t algo;			/* load balancing algorithm and variants: BE_LB_* */
-- 
2.43.0
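
For reference, check_config_validity() (patch 05) selects the algorithm with
the test "(algo & BE_LB_PARM) == BE_LB_CB_PE". A small sketch of that mask
arithmetic; BE_LB_CB_PE is the value from this patch, while BE_LB_KIND_CB,
BE_LB_NEED_NONE and BE_LB_PARM are assumed placeholders here (the real values
live in backend-t.h):

#include <assert.h>

#define BE_LB_KIND_CB   0x00020000U /* assumed placeholder */
#define BE_LB_NEED_NONE 0x00000000U /* assumed placeholder */
#define BE_LB_PARM      0x000000ffU /* assumed placeholder mask */
#define BE_LB_CB_PE     0x00000002U /* from this patch */
#define BE_LB_ALGO_PE   (BE_LB_KIND_CB | BE_LB_NEED_NONE | BE_LB_CB_PE)

int main(void)
{
	unsigned int algo = BE_LB_ALGO_PE;

	/* the same test routes the proxy to pewma_init_server_tree() */
	assert((algo & BE_LB_PARM) == BE_LB_CB_PE);
	return 0;
}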

From 1e67db8482e8657c007223f0f31a6648d8a9c17a Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 00:54:58 +0100
Subject: [PATCH 01/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 006875048c..0415c7b86e 100644
--- a/Makefile
+++ b/Makefile
@@ -989,7 +989,8 @@ OBJS += src/mux_h2.o src/mux_h1.o src/mux_fcgi.o src/log.o		\
         src/raw_sock.o src/action.o src/stats-file.o src/buf.o		\
         src/xprt_handshake.o src/proto_uxst.o src/lb_fwrr.o		\
         src/uri_normalizer.o src/mailers.o src/protocol.o		\
-        src/cfgcond.o src/proto_udp.o src/lb_fwlc.o src/ebmbtree.o	\
+        src/cfgcond.o src/proto_udp.o src/lb_fwlc.o src/lb_pewma.o	\
+        src/ebmbtree.o	\
         src/proto_uxdg.o src/cfgdiag.o src/sock_unix.o src/sha1.o	\
         src/lb_fas.o src/clock.o src/sock_inet.o src/ev_select.o	\
         src/lb_map.o src/shctx.o src/hpack-dec.o src/net_helper.o       \
-- 
2.43.0

From 509f7c9e5309c35767acaac2db7c7caa8f98f25a Mon Sep 17 00:00:00 2001
From: Aleksandar Lazic <[email protected]>
Date: Fri, 6 Feb 2026 08:44:41 +0100
Subject: [PATCH 09/10] MEDIUM: backend: Add Peak EWMA load balancing algorithm

refer to https://github.com/haproxy/haproxy/issues/1570

Signed-off-by: Aleksandar Lazic <[email protected]>
---
 reg-tests/balance/balance-peak-ewma.vtc | 141 ++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 reg-tests/balance/balance-peak-ewma.vtc

diff --git a/reg-tests/balance/balance-peak-ewma.vtc b/reg-tests/balance/balance-peak-ewma.vtc
new file mode 100644
index 0000000000..dbc0252b1c
--- /dev/null
+++ b/reg-tests/balance/balance-peak-ewma.vtc
@@ -0,0 +1,141 @@
+vtest "Test for balance peak-ewma"
+feature ignore_unknown_macro
+
+# This test verifies that the peak-ewma load balancing algorithm
+# correctly accepts configuration and handles requests. With no prior
+# response time data (cold start), latency defaults to 1 and the
+# algorithm degrades to weighted least connections behavior.
+
+# Test 1: Basic functionality - verify config parsing and request handling
+
+server s1 {
+    rxreq
+    txresp -hdr "Server: s1"
+} -repeat 6 -start
+
+server s2 {
+    rxreq
+    txresp -hdr "Server: s2"
+} -repeat 6 -start
+
+server s3 {
+    rxreq
+    txresp -hdr "Server: s3"
+} -repeat 6 -start
+
+haproxy h1 -arg "-L A" -conf {
+    global
+    .if feature(THREAD)
+        thread-groups 1
+    .endif
+
+    defaults
+        mode http
+        timeout server "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout client "${HAPROXY_TEST_TIMEOUT-5s}"
+
+    listen px
+        bind "fd@${px}"
+        balance peak-ewma
+        server srv1 ${s1_addr}:${s1_port}
+        server srv2 ${s2_addr}:${s2_port}
+        server srv3 ${s3_addr}:${s3_port}
+} -start
+
+# Send sequential requests and verify they all succeed
+client c1 -connect ${h1_px_sock} {
+    txreq -url "/req1"
+    rxresp
+    expect resp.status == 200
+} -run
+
+client c2 -connect ${h1_px_sock} {
+    txreq -url "/req2"
+    rxresp
+    expect resp.status == 200
+} -run
+
+client c3 -connect ${h1_px_sock} {
+    txreq -url "/req3"
+    rxresp
+    expect resp.status == 200
+} -run
+
+client c4 -connect ${h1_px_sock} {
+    txreq -url "/req4"
+    rxresp
+    expect resp.status == 200
+} -run
+
+client c5 -connect ${h1_px_sock} {
+    txreq -url "/req5"
+    rxresp
+    expect resp.status == 200
+} -run
+
+client c6 -connect ${h1_px_sock} {
+    txreq -url "/req6"
+    rxresp
+    expect resp.status == 200
+} -run
+
+# Test 2: Concurrent requests - when one server is busy, the algorithm
+# should prefer idle servers. Use barriers to ensure c8 starts while
+# c7's request is still in-flight on one server.
+
+barrier b1 cond 2
+barrier b2 cond 2
+
+server s4 {
+    rxreq
+    barrier b2 sync
+    barrier b1 sync
+    txresp -hdr "Server: s4"
+} -start
+
+server s5 {
+    rxreq
+    barrier b1 sync
+    txresp -hdr "Server: s5"
+} -start
+
+haproxy h2 -arg "-L A" -conf {
+    global
+    .if feature(THREAD)
+        thread-groups 1
+    .endif
+
+    defaults
+        mode http
+        timeout server "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout connect "${HAPROXY_TEST_TIMEOUT-5s}"
+        timeout client "${HAPROXY_TEST_TIMEOUT-5s}"
+
+    listen px2
+        bind "fd@${px2}"
+        balance peak-ewma
+        server srv4 ${s4_addr}:${s4_port}
+        server srv5 ${s5_addr}:${s5_port}
+} -start
+
+# c7 sends a request that will block on s4 (waiting for barrier)
+client c7 -connect ${h2_px2_sock} {
+    txreq -url "/slow"
+    rxresp
+    expect resp.status == 200
+} -start
+
+# Wait for c7's request to reach s4
+barrier b2 sync
+
+# c8 sends while c7 is still in-flight on s4; peak-ewma should
+# prefer s5 since s4 has higher inflight count
+client c8 -connect ${h2_px2_sock} {
+    txreq -url "/fast"
+    rxresp
+    expect resp.status == 200
+    expect resp.http.Server ~ s5
+} -run
+
+client c7 -wait
-- 
2.43.0
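
Assuming the standard reg-test runner and a VTest binary, the new test can be
run on its own with something like:

    make reg-tests REG_TEST_FILES=reg-tests/balance/balance-peak-ewma.vtc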
