On Fri, Sep 05, 2025 at 10:40:46AM +0100, Dean Rasheed wrote:
> Alternatively, why not just impose the upper bound at the call sites
> if needed, so there may be no need for these functions at all. For
> example, looking at nodeHash.c, it would seem much more logical to
> have ExecChooseHashTableSize() put an upper bound on nbuckets, rather
> than capping log2_nbuckets after nbuckets has been chosen, risking
> them getting out-of-sync.

Yep, that may be the best course of action.  As far as I can see, this
is capped by palloc() and HashJoinTuple, so we should be OK with
putting a hard limit at (INT_MAX / 2) and call it a day, I guess?

The two other call sites of my_log2() are worker.c, for the number of
subxacts, which relies on int32.  The other call site is nodeAgg.c,
capped at HASHAGG_MAX_PARTITIONS (1024).

As of the attached, dynahash.h can be removed, which is the minimal
goal I had in mind.  I am not sure about the need to tweak more
dynahash.c, as we've relied on long in this file for many years.  We
could bite the bullet and do it, of course, but I am not sure..  So
I would be happy with only the attached changes.

What do you think?
--
Michael
From 5c81cf6b035368ac2dc9c4b20881a9e13eff5360 Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Tue, 9 Sep 2025 12:40:04 +0900
Subject: [PATCH v3 1/2] Replace callers of dynahash.h's my_log() by equivalent
 in pg_bitutils.h

All these callers use 4-byte signed integers for their variables, hence
they do not require my_log2() maximum based on int64.  This eases an
upcoming removal of my_log2().

ExecChooseHashTableSize(), that calculates the number of buckets to use
for hash tables, is now capped at 2^30, to keep pg_ceil_log2_32() on the
same side.  nodeAgg.c is already capped at 1024, and worker.c relies on
the maximum number of subxacts.
---
 src/backend/executor/nodeAgg.c           |  3 +--
 src/backend/executor/nodeHash.c          | 14 ++++++++++----
 src/backend/replication/logical/worker.c |  3 +--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 377e016d7322..a4f3d30f307c 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -267,7 +267,6 @@
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/datum.h"
-#include "utils/dynahash.h"
 #include "utils/expandeddatum.h"
 #include "utils/injection_point.h"
 #include "utils/logtape.h"
@@ -2115,7 +2114,7 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
 	npartitions = (int) dpartitions;
 
 	/* ceil(log2(npartitions)) */
-	partition_bits = my_log2(npartitions);
+	partition_bits = pg_ceil_log2_32(npartitions);
 
 	/* make sure that we don't exhaust the hash bits */
 	if (partition_bits + used_bits >= 32)
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8d2201ab67fa..db8f4722ebb2 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -36,7 +36,6 @@
 #include "executor/nodeHashjoin.h"
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
-#include "utils/dynahash.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
@@ -340,7 +339,7 @@ MultiExecParallelHash(HashState *node)
 	 */
 	hashtable->curbatch = -1;
 	hashtable->nbuckets = pstate->nbuckets;
-	hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
+	hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets);
 	hashtable->totalTuples = pstate->total_tuples;
 
 	/*
@@ -480,7 +479,7 @@ ExecHashTableCreate(HashState *state)
 							&nbuckets, &nbatch, &num_skew_mcvs);
 
 	/* nbuckets must be a power of 2 */
-	log2_nbuckets = my_log2(nbuckets);
+	log2_nbuckets = pg_ceil_log2_32(nbuckets);
 	Assert(nbuckets == (1 << log2_nbuckets));
 
 	/*
@@ -935,6 +934,13 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	Assert(nbuckets > 0);
 	Assert(nbatch > 0);
 
+	/*
+	 * Cap nbuckets, for power of 2 calculations.  This maximum is safe
+	 * for pg_ceil_log2_32().
+	 */
+	if (nbuckets > PG_INT32_MAX / 2)
+		nbuckets = PG_INT32_MAX / 2;
+
 	*numbuckets = nbuckets;
 	*numbatches = nbatch;
 }
@@ -3499,7 +3505,7 @@ ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno)
 		dsa_get_address(hashtable->area,
 						hashtable->batches[batchno].shared->buckets);
 	hashtable->nbuckets = hashtable->parallel_state->nbuckets;
-	hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
+	hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets);
 	hashtable->current_chunk = NULL;
 	hashtable->current_chunk_shared = InvalidDsaPointer;
 	hashtable->batches[batchno].at_least_one_chunk = false;
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index c0f6bef5c282..c6d7c1fcde71 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -276,7 +276,6 @@
 #include "storage/procarray.h"
 #include "tcop/tcopprot.h"
 #include "utils/acl.h"
-#include "utils/dynahash.h"
 #include "utils/guc.h"
 #include "utils/inval.h"
 #include "utils/lsyscache.h"
@@ -5109,7 +5108,7 @@ subxact_info_read(Oid subid, TransactionId xid)
 	len = sizeof(SubXactInfo) * subxact_data.nsubxacts;
 
 	/* we keep the maximum as a power of 2 */
-	subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts);
+	subxact_data.nsubxacts_max = 1 << pg_ceil_log2_32(subxact_data.nsubxacts);
 
 	/*
 	 * Allocate subxact information in the logical streaming context. We need
-- 
2.51.0

From c6a492debc0d4c74c26f48db18aeeb6d4db8a088 Mon Sep 17 00:00:00 2001
From: Michael Paquier <mich...@paquier.xyz>
Date: Tue, 9 Sep 2025 12:42:48 +0900
Subject: [PATCH v3 2/2] Remove dynahash.h

All the callers of my_log2() are now limited to dynahash.c, so let's
remove its header.  This capability is provided by pg_bitutils.h
already.
---
 src/include/utils/dynahash.h      | 20 --------------------
 src/backend/utils/hash/dynahash.c |  4 ++--
 2 files changed, 2 insertions(+), 22 deletions(-)
 delete mode 100644 src/include/utils/dynahash.h

diff --git a/src/include/utils/dynahash.h b/src/include/utils/dynahash.h
deleted file mode 100644
index a4362d3f65e5..000000000000
--- a/src/include/utils/dynahash.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * dynahash.h
- *	  POSTGRES dynahash.h file definitions
- *
- *
- * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- * IDENTIFICATION
- *		src/include/utils/dynahash.h
- *
- *-------------------------------------------------------------------------
- */
-#ifndef DYNAHASH_H
-#define DYNAHASH_H
-
-extern int	my_log2(int64 num);
-
-#endif							/* DYNAHASH_H */
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 1aeee5be42ac..ac94b9e93c6e 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -102,7 +102,6 @@
 #include "port/pg_bitutils.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
-#include "utils/dynahash.h"
 #include "utils/memutils.h"
 
 
@@ -281,6 +280,7 @@ static bool init_htab(HTAB *hashp, int64 nelem);
 pg_noreturn static void hash_corrupted(HTAB *hashp);
 static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
 								  HASHBUCKET **bucketptr);
+static int	my_log2(int64 num);
 static int64 next_pow2_int64(int64 num);
 static int	next_pow2_int(int64 num);
 static void register_seq_scan(HTAB *hashp);
@@ -1813,7 +1813,7 @@ hash_corrupted(HTAB *hashp)
 }
 
 /* calculate ceil(log base 2) of num */
-int
+static int
 my_log2(int64 num)
 {
 	/*
-- 
2.51.0

Attachment: signature.asc
Description: PGP signature

Reply via email to