From 8d54057e0a0901c99b4d06eeb4c3bc2e8dad8f19 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 11 Aug 2021 21:37:00 -0400
Subject: [PATCH v2 2/2] Replace intrinsics in pg_popcount{32,64}_slow with
 pure C code

Intrinsics are used in the hope that the compiler will emit a fast
hardware instruction where one is available. However, on x86 at least,
__builtin_popcount() did not emit a POPCNT instruction because -mpopcnt
was not passed to the compiler. Instead, where the intrinsic was
supported, the compiler emitted bitwise operations; where it was not
supported, we fell back to a byte-at-a-time loop over a lookup table.

Since the *_slow functions are fallback implementations, replace the
intrinsics and the associated #ifdef maze with the bitwise operations
written in C so all platforms can benefit from them.

If we ever get configure support for x86-64-v2, we could use these
intrinsics to emit the POPCNT instruction without a runtime check. To
allow for that possibility, let's keep the configure checks around.
---
 src/port/pg_bitutils.c | 47 ++++++++++++++----------------------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index b5a62def6f..0d6a0cd5d8 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -207,6 +207,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 #endif							/* TRY_POPCNT_FAST */
 
 
+/*
+ * The *_slow implementations are based on
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ */
+
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
@@ -214,19 +219,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
 int
 pg_popcount32_slow(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-	return __builtin_popcount(word);
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & 0x55555555);	/* each 2-bit field now holds its own bit count */
+	word = (word & 0x33333333) +
+		((word >> 2) & 0x33333333);	/* add adjacent pairs: 4-bit field counts */
+	word = (word + (word >> 4)) & 0xF0F0F0F;	/* add adjacent nibbles: one count per byte */
+	return (int) ((word * 0x1010101) >> 24);	/* multiply sums all bytes into the top byte */
 }
 
 /*
@@ -236,25 +233,11 @@ pg_popcount32_slow(uint32 word)
 int
 pg_popcount64_slow(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if defined(HAVE_LONG_INT_64)
-	return __builtin_popcountl(word);
-#elif defined(HAVE_LONG_LONG_INT_64)
-	return __builtin_popcountll(word);
-#else
-#error must have a working 64-bit integer datatype
-#endif
-#else							/* !HAVE__BUILTIN_POPCOUNT */
-	int			result = 0;
-
-	while (word != 0)
-	{
-		result += pg_number_of_ones[word & 255];
-		word >>= 8;
-	}
-
-	return result;
-#endif							/* HAVE__BUILTIN_POPCOUNT */
+	word = word - ((word >> 1) & UINT64CONST(0x5555555555555555));	/* each 2-bit field now holds its own bit count */
+	word = (word & UINT64CONST(0x3333333333333333)) +
+		((word >> 2) & UINT64CONST(0x3333333333333333));	/* add adjacent pairs: 4-bit field counts */
+	word = (word + (word >> 4)) & UINT64CONST(0xF0F0F0F0F0F0F0F);	/* add adjacent nibbles: one count per byte */
+	return (int) ((word * UINT64CONST(0x101010101010101)) >> 56);	/* multiply sums all bytes into the top byte */
 }
 
 #ifndef TRY_POPCNT_FAST
-- 
2.31.1

