On 27/07/2025 8:24 PM, Konstantin Knizhnik wrote:

I am still trying to understand the reason for the DSA overflow in hash join.
In addition to the two suspicious places where the number of buckets is doubled without a check for overflow (nodeHash.c:1668 and nodeHash.c:3290), there is one more place where the number of batches is multiplied by `EstimateParallelHashJoinBatch(hashtable)`, which is

sizeof(ParallelHashJoinBatch) + (sizeof(SharedTuplestore)  + sizeof(SharedTuplestoreParticipant) * participants) * 2

which works out to 480 bytes!

But when we calculate the maximal number of batches, we limit it by the maximal number of pointers (8 bytes each):

    max_pointers = hash_table_bytes / sizeof(HashJoinTuple);
    max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
    /* If max_pointers isn't a power of 2, must round it down to one */
    max_pointers = pg_prevpower2_size_t(max_pointers);

    /* Also ensure we avoid integer overflow in nbatch and nbuckets */
    /* (this step is redundant given the current value of MaxAllocSize) */
    max_pointers = Min(max_pointers, INT_MAX / 2 + 1);

    dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
    dbuckets = Min(dbuckets, max_pointers);
    nbuckets = (int) dbuckets;


But as we can see, the multiplier here is 480 bytes, not 8 bytes.
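
To put numbers on this mismatch, here is a rough standalone sketch (plain C, not PostgreSQL code; it only reuses MaxAllocSize and the ~480-byte per-batch estimate discussed above) of the worst-case dsa_allocate0() request when nbatch is limited only by max_pointers:

/*
 * Rough arithmetic sketch, not PostgreSQL code: shows why limiting nbatch by
 * max_pointers (8 bytes per entry) does not protect the ~480-bytes-per-batch
 * DSA allocation made in ExecParallelHashJoinSetUpBatches().
 */
#include <stdio.h>

/* round down to a power of two, similar to pg_prevpower2_size_t() */
static size_t
prevpower2(size_t v)
{
    size_t p = 1;

    while (p <= v / 2)
        p *= 2;
    return p;
}

int
main(void)
{
    const size_t max_alloc_size = 0x3fffffff;   /* MaxAllocSize, just under 1 GB */
    const size_t ptr_size = 8;                  /* sizeof(HashJoinTuple) on 64-bit */
    const size_t per_batch = 480;               /* EstimateParallelHashJoinBatch(), per above */

    /* the existing cap on nbatch, derived from pointer size only */
    size_t max_pointers = prevpower2(max_alloc_size / ptr_size);   /* 2^26 */

    /* worst-case size of the per-batch shared memory allocation */
    printf("%zu batches * %zu bytes = %zu bytes (limit %zu)\n",
           max_pointers, per_batch, max_pointers * per_batch, max_alloc_size);
    return 0;
}

With 2^26 batches this asks for roughly 30 GB, far above the ~1 GB MaxAllocSize limit.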


Below is a script to reproduce the problem:

CREATE TABLE IF NOT EXISTS t0(c0 FLOAT, PRIMARY KEY(c0)) WITH (parallel_workers=966);
CREATE TABLE t2(c0 DECIMAL, c1 int4range ) WITH (parallel_workers=393);
CREATE TABLE t4(LIKE t2);
CREATE TABLE t5(LIKE t0);
INSERT INTO t4(c0) VALUES(0.5934077416223362);

set work_mem='10MB';
set max_parallel_workers_per_gather=5;

explain SELECT SUM(count) FROM (SELECT ALL CAST(FALSE AS INT) as count FROM ONLY t5, ONLY t2 CROSS JOIN ONLY t0 LEFT OUTER JOIN t4* ON (upper(((t2.c1)+(t2.c1))))::BOOLEAN CROSS JOIN (SELECT t4.c0 FROM ONLY t0, t2*, t5*, t4* WHERE (((t2.c1)*(t2.c1))) IN (t4.c1)) AS sub0) as res;

SELECT SUM(count) FROM (SELECT ALL CAST(FALSE AS INT) as count FROM ONLY t5, ONLY t2 CROSS JOIN ONLY t0 LEFT OUTER JOIN t4* ON (upper(((t2.c1)+(t2.c1))))::BOOLEAN CROSS JOIN (SELECT t4.c0 FROM ONLY t0, t2*, t5*, t4* WHERE (((t2.c1)*(t2.c1))) IN (t4.c1)) AS sub0) as res;


And attached please find a patch fixing the issue.
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8d2201ab67f..c93e24fccf2 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -35,6 +35,7 @@
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "port/pg_bitutils.h"
 #include "utils/dynahash.h"
 #include "utils/lsyscache.h"
@@ -668,6 +669,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
        size_t          hash_table_bytes;
        size_t          bucket_bytes;
        size_t          max_pointers;
+       size_t          max_batches;
        int                     nbatch = 1;
        int                     nbuckets;
        double          dbuckets;
@@ -775,6 +777,16 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
        /* If max_pointers isn't a power of 2, must round it down to one */
        max_pointers = pg_prevpower2_size_t(max_pointers);
 
+       /*
+        * Prevent DSA overflow. This is the expanded definition of
+        * EstimateParallelHashJoinBatch used in ExecParallelHashJoinSetUpBatches:
+        *     dsa_allocate0(hashtable->area,
+        *                   EstimateParallelHashJoinBatch(hashtable) * nbatch)
+        */
+       max_batches = MaxAllocSize / (MAXALIGN(sizeof(ParallelHashJoinBatch)) +
+                                     MAXALIGN(sts_estimate(max_parallel_workers_per_gather + 1) * 2));
+       max_batches = pg_prevpower2_size_t(max_batches);
+
        /* Also ensure we avoid integer overflow in nbatch and nbuckets */
        /* (this step is redundant given the current value of MaxAllocSize) */
        max_pointers = Min(max_pointers, INT_MAX / 2 + 1);
@@ -844,6 +856,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
                /* Calculate required number of batches. */
                dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
                dbatch = Min(dbatch, max_pointers);
+               dbatch = Min(dbatch, max_batches);
                minbatch = (int) dbatch;
                nbatch = pg_nextpower2_32(Max(2, minbatch));
        }
@@ -910,7 +923,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         * this during the initial sizing - once we start building the hash,
         * nbucket is fixed.
         */
-       while (nbatch > 0)
+       while (nbatch > 0 &&
+                  nbuckets * 2 <= max_pointers) /* prevent allocation limit overflow */
        {
                /* how much memory are we using with current nbatch value */
                size_t          current_space = hash_table_bytes + (2 * nbatch * BLCKSZ);
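
For what it's worth, a quick standalone check (same caveats as above: plain C, reusing the ~480-byte per-batch estimate for 6 participants) suggests the new clamp keeps the worst-case request under MaxAllocSize:

/*
 * Standalone check of the max_batches clamp added by the patch, using the
 * ~480-byte per-batch estimate discussed above.
 */
#include <stdio.h>

/* round down to a power of two, similar to pg_prevpower2_size_t() */
static size_t
prevpower2(size_t v)
{
    size_t p = 1;

    while (p <= v / 2)
        p *= 2;
    return p;
}

int
main(void)
{
    const size_t max_alloc_size = 0x3fffffff;   /* MaxAllocSize */
    const size_t per_batch = 480;               /* EstimateParallelHashJoinBatch(), per above */

    size_t max_batches = prevpower2(max_alloc_size / per_batch);

    printf("max_batches = %zu, worst-case request = %zu bytes (limit %zu)\n",
           max_batches, max_batches * per_batch, max_alloc_size);
    return 0;
}

So in this configuration the number of batches is capped at 2^21 and the allocation stays just below the 1 GB limit.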

Reply via email to