From 3c8e3e34dc79109394b948330dfa4e32666b57e0 Mon Sep 17 00:00:00 2001
From: Frank Heikens <fheikens@users.noreply.github.com>
Date: Thu, 2 Apr 2026 21:01:19 -0700
Subject: [PATCH] Throttle hash_agg_check_limits to avoid O(N^2) context
 traversal

hash_agg_check_limits() calls MemoryContextMemAllocated() with
recurse=true after every new hash group addition.  This traverses
all child memory contexts to sum their mem_allocated, which is O(C)
where C is the number of child contexts.

Some aggregate transition functions create per-group child contexts.
For example, array_append (when used as an aggregate SFUNC) creates
an expanded-array object with a private AllocSet for each group.
With N groups the total traversal cost is O(N * C) = O(N^2), which
becomes the dominant cost once tens of thousands of groups are
present in the hash table.

On a reproducer with 50K groups using a user-defined array_agg
with array_append as SFUNC, the unthrottled check accounts for
roughly 97% of the query time: ~5 seconds versus ~150 ms for the
actual aggregate computation.

Fix by throttling the recursive memory check: once the group count
exceeds 1024, only perform the full check every 1024 new groups.
This bounds the memory overshoot before spill detection to at most
1024 groups' worth of allocations while amortizing the traversal cost.

Benchmarks on 50K groups: 4.6 s -> 152 ms (30x).
Benchmarks on 100K groups: 35 s -> 2.9 s (12x).
Benchmarks on 500K groups: 672 s -> 1.7 s (395x).
No regressions observed for built-in aggregates, non-array aggregates,
or batched (low work_mem) workloads.  Spill behavior (batch count,
disk usage) is unchanged.
---
 src/backend/executor/nodeAgg.c | 38 +++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 925caadd2ce..9a23dedeccd 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1861,20 +1861,44 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
  * After adding a new group to the hash table, check whether we need to enter
  * spill mode. Allocations may happen without adding new groups (for instance,
  * if the transition state size grows), so this check is imperfect.
+ *
+ * Note: MemoryContextMemAllocated with recurse=true traverses all child
+ * contexts, making it O(C) where C is the child count.  Some aggregate
+ * transition functions create per-group child contexts (for example,
+ * array_append creates expanded-array objects each owning a private
+ * AllocSet).  Calling this function after every new group would then be
+ * O(N*C) = O(N^2) total, which becomes the dominant cost with tens of
+ * thousands of groups.  We avoid that by throttling: once there are more
+ * than 1024 groups, we only run the full check every 1024th group.  This
+ * caps the overshoot to at most 1024 groups' worth of memory before spill
+ * mode is entered.
  */
 static void
 hash_agg_check_limits(AggState *aggstate)
 {
 	uint64		ngroups = aggstate->hash_ngroups_current;
-	Size		meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
-													 true);
-	Size		entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt,
-													  true);
-	Size		tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
-													 true);
-	Size		total_mem = meta_mem + entry_mem + tval_mem;
+	Size		meta_mem;
+	Size		entry_mem;
+	Size		tval_mem;
+	Size		total_mem;
 	bool		do_spill = false;
 
+	/*
+	 * Throttle the expensive recursive MemoryContextMemAllocated calls.
+	 * Below 1024 groups the quadratic cost is negligible; above that we
+	 * check only every 1024th group to amortize the traversal cost.
+	 */
+	if (ngroups > 1024 && (ngroups & 0x3FF) != 0)
+		return;
+
+	meta_mem = MemoryContextMemAllocated(aggstate->hash_metacxt,
+										 true);
+	entry_mem = MemoryContextMemAllocated(aggstate->hash_tuplescxt,
+										  true);
+	tval_mem = MemoryContextMemAllocated(aggstate->hashcontext->ecxt_per_tuple_memory,
+										 true);
+	total_mem = meta_mem + entry_mem + tval_mem;
+
 #ifdef USE_INJECTION_POINTS
 	if (ngroups >= 1000)
 	{
-- 
2.33.0

