Hi,

> Thanks again.
>
> > I think it makes sense to squash 0001 and 0003 together, then 0002 and
> > 0004 together.
> > [...]
> > 0005 doesn't buy us as much in readability since the two lines no longer 
> > match.
>
> Makes sense.
>
> > For the first, we should probably combine in the upper half when using
> > a 64-bit hash, like this:
>
> We could do it if you insist but I'm convinced this is redundant. In a
> good hash upper 32 bits are as evenly distributed as lower ones so
> this combining doesn't buy us much. This may even cause more
> collisions, for values that didn't have them initially.
>
> > Further cleanup possible now that we have 64-bit datums: MAC addresses
> > are always 6 bytes, so abbreviation is no longer relevant -- datum1 is
> > authoritative. That's in scope for the thread subject but also a
> > bigger patch, but maybe someone would like to pick it up for PG20.
>
> I will pick it up and submit as a separate patch a bit later.

0002 had a wrong commit message due to a mistake during squashing.
Here is a corrected patch.

-- 
Best regards,
Aleksander Alekseev
From 1761ccbb3c448a8db88aebf39c4403a5a76b1dec Mon Sep 17 00:00:00 2001
From: Aleksander Alekseev <[email protected]>
Date: Tue, 3 Feb 2026 16:45:10 +0300
Subject: [PATCH v4 2/2] Avoid unnecessary type casting when using hash_any()

hash_any() is merely a wrapper for hash_bytes(). Call it directly when possible
in order to avoid unnecessary type casting.

Additionally, improve the comment for addHyperLogLog().Previously the comment
suggested to use hash_any() which return value is Datum. Since the argument of
addHyperLogLog() is uint32, recommending hash_bytes() is more appropriate.

Author: Aleksander Alekseev <[email protected]>
Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com
---
 contrib/ltree/ltree_op.c                   | 2 +-
 src/backend/access/tablesample/bernoulli.c | 4 ++--
 src/backend/access/tablesample/system.c    | 4 ++--
 src/backend/commands/async.c               | 8 ++++----
 src/backend/lib/hyperloglog.c              | 2 +-
 src/backend/nodes/bitmapset.c              | 4 ++--
 src/backend/tsearch/ts_typanalyze.c        | 4 ++--
 src/backend/utils/adt/bytea.c              | 4 ++--
 src/backend/utils/adt/jsonb_gin.c          | 2 +-
 src/backend/utils/adt/jsonb_util.c         | 4 ++--
 src/backend/utils/adt/varlena.c            | 4 ++--
 src/backend/utils/cache/funccache.c        | 8 ++++----
 12 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/contrib/ltree/ltree_op.c b/contrib/ltree/ltree_op.c
index c1fc77fc804..a9a1bce0861 100644
--- a/contrib/ltree/ltree_op.c
+++ b/contrib/ltree/ltree_op.c
@@ -144,7 +144,7 @@ hash_ltree(PG_FUNCTION_ARGS)
 
 	while (an > 0)
 	{
-		uint32		levelHash = DatumGetUInt32(hash_any((unsigned char *) al->name, al->len));
+		uint32		levelHash = hash_bytes((unsigned char *) al->name, al->len);
 
 		/*
 		 * Combine hash values of successive elements by multiplying the
diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c
index 7d8560464c8..f768d40143e 100644
--- a/src/backend/access/tablesample/bernoulli.c
+++ b/src/backend/access/tablesample/bernoulli.c
@@ -214,8 +214,8 @@ bernoulli_nextsampletuple(SampleScanState *node,
 
 		hashinput[1] = tupoffset;
 
-		hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
-									   (int) sizeof(hashinput)));
+		hash = hash_bytes((const unsigned char *) hashinput,
+						  (int) sizeof(hashinput));
 		if (hash < sampler->cutoff)
 			break;
 	}
diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c
index a2b9ba8eea9..de13dd8cab9 100644
--- a/src/backend/access/tablesample/system.c
+++ b/src/backend/access/tablesample/system.c
@@ -202,8 +202,8 @@ system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
 
 		hashinput[0] = nextblock;
 
-		hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
-									   (int) sizeof(hashinput)));
+		hash = hash_bytes((const unsigned char *) hashinput,
+						  (int) sizeof(hashinput));
 		if (hash < sampler->cutoff)
 			break;
 	}
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 315270ddb11..aee845218e9 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -652,8 +652,8 @@ globalChannelTableHash(const void *key, size_t size, void *arg)
 	dshash_hash h;
 
 	h = murmurhash32(k->dboid);
-	h ^= hash_any((const unsigned char *) k->channel,
-				  strnlen(k->channel, NAMEDATALEN));
+	h ^= hash_bytes((const unsigned char *) k->channel,
+				    strnlen(k->channel, NAMEDATALEN));
 
 	return h;
 }
@@ -3244,8 +3244,8 @@ notification_hash(const void *key, Size keysize)
 
 	Assert(keysize == sizeof(Notification *));
 	/* We don't bother to include the payload's trailing null in the hash */
-	return DatumGetUInt32(hash_any((const unsigned char *) k->data,
-								   k->channel_len + k->payload_len + 1));
+	return hash_bytes((const unsigned char *) k->data,
+					  k->channel_len + k->payload_len + 1);
 }
 
 /*
diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c
index c74f11217ef..2b82ff12b2e 100644
--- a/src/backend/lib/hyperloglog.c
+++ b/src/backend/lib/hyperloglog.c
@@ -158,7 +158,7 @@ freeHyperLogLog(hyperLogLogState *cState)
  * Adds element to the estimator, from caller-supplied hash.
  *
  * It is critical that the hash value passed be an actual hash value, typically
- * generated using hash_any().  The algorithm relies on a specific bit-pattern
+ * generated using hash_bytes().  The algorithm relies on a specific bit-pattern
  * observable in conjunction with stochastic averaging.  There must be a
  * uniform distribution of bits in hash values for each distinct original value
  * observed.
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 786f343b3c9..68bb936898b 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -1405,8 +1405,8 @@ bms_hash_value(const Bitmapset *a)
 
 	if (a == NULL)
 		return 0;				/* All empty sets hash to 0 */
-	return DatumGetUInt32(hash_any((const unsigned char *) a->words,
-								   a->nwords * sizeof(bitmapword)));
+	return hash_bytes((const unsigned char *) a->words,
+					  a->nwords * sizeof(bitmapword));
 }
 
 /*
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 48ee050e37f..6ca8ead3988 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -496,8 +496,8 @@ lexeme_hash(const void *key, Size keysize)
 {
 	const LexemeHashKey *l = (const LexemeHashKey *) key;
 
-	return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme,
-								   l->length));
+	return hash_bytes((const unsigned char *) l->lexeme,
+					  l->length);
 }
 
 /*
diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c
index f32bd23d03c..1e255f00570 100644
--- a/src/backend/utils/adt/bytea.c
+++ b/src/backend/utils/adt/bytea.c
@@ -1106,8 +1106,8 @@ bytea_abbrev_convert(Datum original, SortSupport ssup)
 	 * in order to compensate for cases where differences are past
 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
 	 */
-	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
-								   Min(len, PG_CACHE_LINE_SIZE)));
+	hash = hash_bytes((unsigned char *) authoritative_data,
+					  Min(len, PG_CACHE_LINE_SIZE));
 
 	if (len > PG_CACHE_LINE_SIZE)
 		hash ^= murmurhash32((uint32) len);
diff --git a/src/backend/utils/adt/jsonb_gin.c b/src/backend/utils/adt/jsonb_gin.c
index d72a6441c5e..f5dbd5589d3 100644
--- a/src/backend/utils/adt/jsonb_gin.c
+++ b/src/backend/utils/adt/jsonb_gin.c
@@ -1333,7 +1333,7 @@ make_text_key(char flag, const char *str, int len)
 	{
 		uint32		hashval;
 
-		hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len));
+		hashval = hash_bytes((const unsigned char *) str, len);
 		snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval);
 		str = hashbuf;
 		len = 8;
diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c
index 91fb9ea09bf..91bff9fda3a 100644
--- a/src/backend/utils/adt/jsonb_util.c
+++ b/src/backend/utils/adt/jsonb_util.c
@@ -1451,8 +1451,8 @@ JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash)
 			tmp = 0x01;
 			break;
 		case jbvString:
-			tmp = DatumGetUInt32(hash_any((const unsigned char *) scalarVal->val.string.val,
-										  scalarVal->val.string.len));
+			tmp = hash_bytes((const unsigned char *) scalarVal->val.string.val,
+							 scalarVal->val.string.len);
 			break;
 		case jbvNumeric:
 			/* Must hash equal numerics to equal hash codes */
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index ea3d4dcf18b..30545904101 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -2151,8 +2151,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 	 * in order to compensate for cases where differences are past
 	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
 	 */
-	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
-								   Min(len, PG_CACHE_LINE_SIZE)));
+	hash = hash_bytes((unsigned char *) authoritative_data,
+					  Min(len, PG_CACHE_LINE_SIZE));
 
 	if (len > PG_CACHE_LINE_SIZE)
 		hash ^= murmurhash32((uint32) len);
diff --git a/src/backend/utils/cache/funccache.c b/src/backend/utils/cache/funccache.c
index 701c294b88d..68bbf0b971c 100644
--- a/src/backend/utils/cache/funccache.c
+++ b/src/backend/utils/cache/funccache.c
@@ -89,13 +89,13 @@ cfunc_hash(const void *key, Size keysize)
 
 	Assert(keysize == sizeof(CachedFunctionHashKey));
 	/* Hash all the fixed fields except callResultType */
-	h = DatumGetUInt32(hash_any((const unsigned char *) k,
-								offsetof(CachedFunctionHashKey, callResultType)));
+	h = hash_bytes((const unsigned char *) k,
+				   offsetof(CachedFunctionHashKey, callResultType));
 	/* Incorporate input argument types */
 	if (k->nargs > 0)
 		h = hash_combine(h,
-						 DatumGetUInt32(hash_any((const unsigned char *) k->argtypes,
-												 k->nargs * sizeof(Oid))));
+						 hash_bytes((const unsigned char *) k->argtypes,
+									k->nargs * sizeof(Oid)));
 	/* Incorporate callResultType if present */
 	if (k->callResultType)
 		h = hash_combine(h, hashRowType(k->callResultType));
-- 
2.43.0

From 3cd625d2d42554111b34d70d0820f68c7ec5db11 Mon Sep 17 00:00:00 2001
From: Aleksander Alekseev <[email protected]>
Date: Tue, 13 Jan 2026 14:51:21 +0300
Subject: [PATCH v4 1/2] Simplify abbreviated key hashing using murmurhash64()

Now when all Datums are 64-bit values we can simplify the code by using
murmurhash64() in *_abbrev_convert() functions. Also replace hash_uint32()
with murmurhash32() in a few other places for consistency.

Author: Aleksander Alekseev <[email protected]>
Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: https://postgr.es/m/CAJ7c6TMPhDRQMmkUHPv8oOK97B1mR8NRS61DgjpdaZUPAwaeZQ%40mail.gmail.com
---
 src/backend/commands/async.c    |  6 +++---
 src/backend/utils/adt/bytea.c   |  9 ++-------
 src/backend/utils/adt/mac.c     | 11 +++--------
 src/backend/utils/adt/network.c |  6 +-----
 src/backend/utils/adt/numeric.c |  5 +----
 src/backend/utils/adt/uuid.c    |  6 +-----
 src/backend/utils/adt/varlena.c |  9 ++-------
 7 files changed, 13 insertions(+), 39 deletions(-)

diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 657c591618d..315270ddb11 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -651,9 +651,9 @@ globalChannelTableHash(const void *key, size_t size, void *arg)
 	const GlobalChannelKey *k = (const GlobalChannelKey *) key;
 	dshash_hash h;
 
-	h = DatumGetUInt32(hash_uint32(k->dboid));
-	h ^= DatumGetUInt32(hash_any((const unsigned char *) k->channel,
-								 strnlen(k->channel, NAMEDATALEN)));
+	h = murmurhash32(k->dboid);
+	h ^= hash_any((const unsigned char *) k->channel,
+				  strnlen(k->channel, NAMEDATALEN));
 
 	return h;
 }
diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c
index fd7662d41ee..f32bd23d03c 100644
--- a/src/backend/utils/adt/bytea.c
+++ b/src/backend/utils/adt/bytea.c
@@ -1110,17 +1110,12 @@ bytea_abbrev_convert(Datum original, SortSupport ssup)
 								   Min(len, PG_CACHE_LINE_SIZE)));
 
 	if (len > PG_CACHE_LINE_SIZE)
-		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+		hash ^= murmurhash32((uint32) len);
 
 	addHyperLogLog(&bss->full_card, hash);
 
 	/* Hash abbreviated key */
-	{
-		uint32		tmp;
-
-		tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
-		hash = DatumGetUInt32(hash_uint32(tmp));
-	}
+	hash = (uint32) murmurhash64(DatumGetUInt64(res));
 
 	addHyperLogLog(&bss->abbr_card, hash);
 
diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c
index f14675dea40..0658846f274 100644
--- a/src/backend/utils/adt/mac.c
+++ b/src/backend/utils/adt/mac.c
@@ -492,17 +492,12 @@ macaddr_abbrev_convert(Datum original, SortSupport ssup)
 	uss->input_count += 1;
 
 	/*
-	 * Cardinality estimation. The estimate uses uint32, so XOR the two 32-bit
-	 * halves together to produce slightly more entropy. The two zeroed bytes
-	 * won't have any practical impact on this operation.
+	 * Cardinality estimation. The estimate uses uint32, so we hash the full
+	 * 64-bit value and take the lower 32 bits of the result.
 	 */
 	if (uss->estimating)
 	{
-		uint32		tmp;
-
-		tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
-
-		addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
+		addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res)));
 	}
 
 	/*
diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c
index 3a2002097dd..c226af5ca80 100644
--- a/src/backend/utils/adt/network.c
+++ b/src/backend/utils/adt/network.c
@@ -739,11 +739,7 @@ network_abbrev_convert(Datum original, SortSupport ssup)
 	/* Hash abbreviated key */
 	if (uss->estimating)
 	{
-		uint32		tmp;
-
-		tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
-
-		addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
+		addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res)));
 	}
 
 	return res;
diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index d25b8ad505d..bbe2581f0b7 100644
--- a/src/backend/utils/adt/numeric.c
+++ b/src/backend/utils/adt/numeric.c
@@ -2397,10 +2397,7 @@ numeric_abbrev_convert_var(const NumericVar *var, NumericSortSupport *nss)
 
 	if (nss->estimating)
 	{
-		uint32		tmp = ((uint32) result
-						   ^ (uint32) ((uint64) result >> 32));
-
-		addHyperLogLog(&nss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
+		addHyperLogLog(&nss->abbr_card, (uint32) murmurhash64(result));
 	}
 
 	return NumericAbbrevGetDatum(result);
diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c
index 6ee3752ac78..888802c3012 100644
--- a/src/backend/utils/adt/uuid.c
+++ b/src/backend/utils/adt/uuid.c
@@ -396,11 +396,7 @@ uuid_abbrev_convert(Datum original, SortSupport ssup)
 
 	if (uss->estimating)
 	{
-		uint32		tmp;
-
-		tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
-
-		addHyperLogLog(&uss->abbr_card, DatumGetUInt32(hash_uint32(tmp)));
+		addHyperLogLog(&uss->abbr_card, (uint32) murmurhash64(DatumGetUInt64(res)));
 	}
 
 	/*
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 7caf700fd61..ea3d4dcf18b 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -2155,17 +2155,12 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 								   Min(len, PG_CACHE_LINE_SIZE)));
 
 	if (len > PG_CACHE_LINE_SIZE)
-		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+		hash ^= murmurhash32((uint32) len);
 
 	addHyperLogLog(&sss->full_card, hash);
 
 	/* Hash abbreviated key */
-	{
-		uint32		tmp;
-
-		tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
-		hash = DatumGetUInt32(hash_uint32(tmp));
-	}
+	hash = (uint32) murmurhash64(DatumGetUInt64(res));
 
 	addHyperLogLog(&sss->abbr_card, hash);
 
-- 
2.43.0

Reply via email to