On 27/07/2025 8:24 PM, Konstantin Knizhnik wrote:
I am still trying to understand the reason for the DSA overflow in hash join.
In addition to the two suspicious places where the number of buckets is
doubled without a check for overflow (nodeHash.c:1668 and nodeHash.c:3290),
there is one more place where the number of batches is multiplied by
`EstimateParallelHashJoinBatch(hashtable)`, which expands to

sizeof(ParallelHashJoinBatch) + (sizeof(SharedTuplestore) +
sizeof(SharedTuplestoreParticipant) * participants) * 2

and comes to 480 bytes!
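For context, the allocation that actually overflows is a single DSA request
whose size grows linearly with nbatch. Paraphrasing the call quoted in the
patch comment further down, it is roughly:

/* in ExecParallelHashJoinSetUpBatches: one DSA chunk holds the whole batch array */
dsa_allocate0(hashtable->area,
              EstimateParallelHashJoinBatch(hashtable) * nbatch);

So with ~480 bytes per batch the request is about 480 * nbatch bytes.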
But when we calculate the maximal number of batches, we limit it by the
maximal number of pointers (8 bytes each):
max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
/* If max_pointers isn't a power of 2, must round it down to one */
max_pointers = pg_prevpower2_size_t(max_pointers);
/* Also ensure we avoid integer overflow in nbatch and nbuckets */
/* (this step is redundant given the current value of MaxAllocSize) */
max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
dbuckets = Min(dbuckets, max_pointers);
nbuckets = (int) dbuckets;
But as we see, the multiplier here is 480 bytes, not 8 bytes.
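To put rough numbers on it, here is a back-of-the-envelope sketch (standalone
C, mine, not from the patch) using the ~480-byte per-batch figure above and
MaxAllocSize = 0x3fffffff; the exact per-batch size depends on the number of
participants and MAXALIGN:

#include <stdio.h>

int
main(void)
{
    const size_t max_alloc = 0x3fffffff;    /* MaxAllocSize, just under 1 GB */
    const size_t per_batch = 480;           /* per-batch estimate from above */

    /* In the worst case (large work_mem) nbatch is capped as if each batch
     * cost one pointer, i.e. 8 bytes. */
    size_t max_pointers = max_alloc / sizeof(void *);   /* 134217727 on 64-bit */

    size_t cap = 1;
    while (cap * 2 <= max_pointers)         /* emulate pg_prevpower2_size_t() */
        cap *= 2;                           /* 2^26 = 67108864 */

    /* Worst-case size of the batch-array allocation: ~30 GB, far above MaxAllocSize. */
    printf("%zu batches -> %zu bytes\n", cap, cap * per_batch);
    return 0;
}

So a single dsa_allocate0() request can end up dozens of times larger than the
allocation limit, even though nbuckets itself never exceeds MaxAllocSize / 8
pointers.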
Below is a script to reproduce the problem:
CREATE TABLE IF NOT EXISTS t0(c0 FLOAT, PRIMARY KEY(c0)) WITH
(parallel_workers=966);
CREATE TABLE t2(c0 DECIMAL, c1 int4range ) WITH (parallel_workers=393);
CREATE TABLE t4(LIKE t2);
CREATE TABLE t5(LIKE t0);
INSERT INTO t4(c0) VALUES(0.5934077416223362);
set work_mem='10MB';
set max_parallel_workers_per_gather=5;
explain SELECT SUM(count) FROM (SELECT ALL CAST(FALSE AS INT) as count
FROM ONLY t5, ONLY t2 CROSS JOIN ONLY t0 LEFT OUTER JOIN t4* ON
(upper(((t2.c1)+(t2.c1))))::BOOLEAN CROSS JOIN (SELECT t4.c0 FROM ONLY
t0, t2*, t5*, t4* WHERE (((t2.c1)*(t2.c1))) IN (t4.c1)) AS sub0) as res;
SELECT SUM(count) FROM (SELECT ALL CAST(FALSE AS INT) as count FROM ONLY
t5, ONLY t2 CROSS JOIN ONLY t0 LEFT OUTER JOIN t4* ON
(upper(((t2.c1)+(t2.c1))))::BOOLEAN CROSS JOIN (SELECT t4.c0 FROM ONLY
t0, t2*, t5*, t4* WHERE (((t2.c1)*(t2.c1))) IN (t4.c1)) AS sub0) as res;
And attached please find a patch fixing the issue.
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8d2201ab67f..c93e24fccf2 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -35,6 +35,7 @@
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
#include "miscadmin.h"
+#include "optimizer/cost.h"
#include "port/pg_bitutils.h"
#include "utils/dynahash.h"
#include "utils/lsyscache.h"
@@ -668,6 +669,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
size_t hash_table_bytes;
size_t bucket_bytes;
size_t max_pointers;
+ size_t max_batches;
int nbatch = 1;
int nbuckets;
double dbuckets;
@@ -775,6 +777,16 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
/* If max_pointers isn't a power of 2, must round it down to one */
max_pointers = pg_prevpower2_size_t(max_pointers);
+ /*
+ * Prevent DSA overflow. This is the expanded definition of
+ * EstimateParallelHashJoinBatch, used in ExecParallelHashJoinSetUpBatches:
+ * dsa_allocate0(hashtable->area,
+ * EstimateParallelHashJoinBatch(hashtable) * nbatch)
+ */
+ max_batches = MaxAllocSize / (MAXALIGN(sizeof(ParallelHashJoinBatch)) +
+                               MAXALIGN(sts_estimate(max_parallel_workers_per_gather + 1) * 2));
+ max_batches = pg_prevpower2_size_t(max_batches);
+
/* Also ensure we avoid integer overflow in nbatch and nbuckets */
/* (this step is redundant given the current value of MaxAllocSize) */
max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
@@ -844,6 +856,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
/* Calculate required number of batches. */
dbatch = ceil(inner_rel_bytes / (hash_table_bytes -
bucket_bytes));
dbatch = Min(dbatch, max_pointers);
+ dbatch = Min(dbatch, max_batches);
minbatch = (int) dbatch;
nbatch = pg_nextpower2_32(Max(2, minbatch));
}
@@ -910,7 +923,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
* this during the initial sizing - once we start building the hash,
* nbucket is fixed.
*/
- while (nbatch > 0)
+ while (nbatch > 0 &&
+        nbuckets * 2 <= max_pointers) /* prevent allocation limit overflow */
{
/* how much memory are we using with current nbatch value */
size_t current_space = hash_table_bytes + (2 * nbatch * BLCKSZ);
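For what it's worth, with the same ~480-byte figure the new clamp works out to

MaxAllocSize / 480            = 1073741823 / 480 = 2236962
pg_prevpower2_size_t(2236962) = 2097152   (2^21)
2097152 * 480                 = 1006632960 bytes

so the batch-array request now stays below MaxAllocSize. The real divisor uses
sts_estimate() and MAXALIGN, so the exact value will differ slightly.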