On Wed, Feb 11, 2026 at 02:10:37PM +0700, John Naylor wrote:
> I don't think we need to worry about this, but the attached looks
> nicer to me. There might be a disadvantage of using macros here, but
> I'm not sure what it would be.

The only problem that I see with this is that a compiler that understands
Neon intrinsics but not SVE ones will probably fail because we build
external functions that call the Neon versions in that case.  But that's
easy enough to work around by adding an extra #elif defined(USE_NEON)
section, and it still saves a handful of lines of code.  Perhaps someday
this situation will be rare enough that we can remove that hack and just
point to the portable versions if the compiler doesn't understand both Neon
and SVE, but I'm not sure we're there yet.

-- 
nathan
>From f73b495eddc6d976e15939691057beb596a20c83 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Fri, 6 Feb 2026 09:26:54 -0600
Subject: [PATCH v13 1/5] Remove some unnecessary optimizations in popcount
 code.

Over the past few releases, we've added a huge amount of complexity
to our popcount implementations.  Commits fbe327e5b4, 79e232ca01,
8c6653516c, and 25dc485074 did some preliminary refactoring, but
many opportunities remain.  In particular, if we disclaim interest
in micro-optimizing this code for 32-bit builds and in unproven
alignment checks, we can remove a decent chunk of code.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 src/include/port/pg_bitutils.h | 16 +-------
 src/port/pg_bitutils.c         | 30 ---------------
 src/port/pg_popcount_x86.c     | 67 ++++++----------------------------
 3 files changed, 14 insertions(+), 99 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 35761f509ec..20c11b79c61 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -333,13 +333,7 @@ pg_popcount(const char *buf, int bytes)
         * We set the threshold to the point at which we'll first use special
         * instructions in the optimized version.
         */
-#if SIZEOF_VOID_P >= 8
-       int                     threshold = 8;
-#else
-       int                     threshold = 4;
-#endif
-
-       if (bytes < threshold)
+       if (bytes < 8)
        {
                uint64          popcnt = 0;
 
@@ -364,13 +358,7 @@ pg_popcount_masked(const char *buf, int bytes, bits8 mask)
         * We set the threshold to the point at which we'll first use special
         * instructions in the optimized version.
         */
-#if SIZEOF_VOID_P >= 8
-       int                     threshold = 8;
-#else
-       int                     threshold = 4;
-#endif
-
-       if (bytes < threshold)
+       if (bytes < 8)
        {
                uint64          popcnt = 0;
 
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index ffda75825e5..bec06c06fc3 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -167,20 +167,6 @@ pg_popcount_portable(const char *buf, int bytes)
                        bytes -= 8;
                }
 
-               buf = (const char *) words;
-       }
-#else
-       /* Process in 32-bit chunks if the buffer is aligned. */
-       if (buf == (const char *) TYPEALIGN(4, buf))
-       {
-               const uint32 *words = (const uint32 *) buf;
-
-               while (bytes >= 4)
-               {
-                       popcnt += pg_popcount32_portable(*words++);
-                       bytes -= 4;
-               }
-
                buf = (const char *) words;
        }
 #endif
@@ -215,22 +201,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
                        bytes -= 8;
                }
 
-               buf = (const char *) words;
-       }
-#else
-       /* Process in 32-bit chunks if the buffer is aligned. */
-       uint32          maskv = ~((uint32) 0) / 0xFF * mask;
-
-       if (buf == (const char *) TYPEALIGN(4, buf))
-       {
-               const uint32 *words = (const uint32 *) buf;
-
-               while (bytes >= 4)
-               {
-                       popcnt += pg_popcount32_portable(*words++ & maskv);
-                       bytes -= 4;
-               }
-
                buf = (const char *) words;
        }
 #endif
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 245f0167d00..7aebf69898b 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -376,40 +376,20 @@ __asm__ __volatile__(" popcntq 
%1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount_sse42
  *             Returns the number of 1-bits in buf
  */
+pg_attribute_no_sanitize_alignment()
 static uint64
 pg_popcount_sse42(const char *buf, int bytes)
 {
        uint64          popcnt = 0;
+       const uint64 *words = (const uint64 *) buf;
 
-#if SIZEOF_VOID_P >= 8
-       /* Process in 64-bit chunks if the buffer is aligned. */
-       if (buf == (const char *) TYPEALIGN(8, buf))
+       while (bytes >= 8)
        {
-               const uint64 *words = (const uint64 *) buf;
-
-               while (bytes >= 8)
-               {
-                       popcnt += pg_popcount64_sse42(*words++);
-                       bytes -= 8;
-               }
-
-               buf = (const char *) words;
+               popcnt += pg_popcount64_sse42(*words++);
+               bytes -= 8;
        }
-#else
-       /* Process in 32-bit chunks if the buffer is aligned. */
-       if (buf == (const char *) TYPEALIGN(4, buf))
-       {
-               const uint32 *words = (const uint32 *) buf;
 
-               while (bytes >= 4)
-               {
-                       popcnt += pg_popcount32_sse42(*words++);
-                       bytes -= 4;
-               }
-
-               buf = (const char *) words;
-       }
-#endif
+       buf = (const char *) words;
 
        /* Process any remaining bytes */
        while (bytes--)
@@ -422,44 +402,21 @@ pg_popcount_sse42(const char *buf, int bytes)
  * pg_popcount_masked_sse42
  *             Returns the number of 1-bits in buf after applying the mask to 
each byte
  */
+pg_attribute_no_sanitize_alignment()
 static uint64
 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
 {
        uint64          popcnt = 0;
-
-#if SIZEOF_VOID_P >= 8
-       /* Process in 64-bit chunks if the buffer is aligned */
        uint64          maskv = ~UINT64CONST(0) / 0xFF * mask;
+       const uint64 *words = (const uint64 *) buf;
 
-       if (buf == (const char *) TYPEALIGN(8, buf))
+       while (bytes >= 8)
        {
-               const uint64 *words = (const uint64 *) buf;
-
-               while (bytes >= 8)
-               {
-                       popcnt += pg_popcount64_sse42(*words++ & maskv);
-                       bytes -= 8;
-               }
-
-               buf = (const char *) words;
+               popcnt += pg_popcount64_sse42(*words++ & maskv);
+               bytes -= 8;
        }
-#else
-       /* Process in 32-bit chunks if the buffer is aligned. */
-       uint32          maskv = ~((uint32) 0) / 0xFF * mask;
-
-       if (buf == (const char *) TYPEALIGN(4, buf))
-       {
-               const uint32 *words = (const uint32 *) buf;
-
-               while (bytes >= 4)
-               {
-                       popcnt += pg_popcount32_sse42(*words++ & maskv);
-                       bytes -= 4;
-               }
 
-               buf = (const char *) words;
-       }
-#endif
+       buf = (const char *) words;
 
        /* Process any remaining bytes */
        while (bytes--)
-- 
2.50.1 (Apple Git-155)

>From d6b07d9573349fc9161f4eb56943e63f77d1c312 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Fri, 23 Jan 2026 17:31:20 -0600
Subject: [PATCH v13 2/5] Remove specialized word-length popcount
 implementations.

The uses of these functions do not justify the level of
micro-optimization we've done and may even hurt performance in some
cases (e.g., due to using function pointers).  This commit removes
all architecture-specific implementations of pg_popcount{32,64}()
and converts the portable ones to inlined functions in
pg_bitutils.h.  These inlined versions should produce the same code
as before (but inlined), so in theory this is a net gain for many
machines.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Reviewed-by: Greg Burd <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 src/include/port/pg_bitutils.h | 75 +++++++++++++++++++++++-----------
 src/port/pg_bitutils.c         | 65 +----------------------------
 src/port/pg_popcount_aarch64.c | 20 +++------
 src/port/pg_popcount_x86.c     | 43 +------------------
 4 files changed, 59 insertions(+), 144 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 20c11b79c61..789663edd93 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -276,46 +276,73 @@ pg_ceil_log2_64(uint64 num)
                return pg_leftmost_one_pos64(num - 1) + 1;
 }
 
-extern int     pg_popcount32_portable(uint32 word);
-extern int     pg_popcount64_portable(uint64 word);
 extern uint64 pg_popcount_portable(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 
mask);
 
-#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
 /*
- * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check
+ * Attempt to use specialized CPU instructions, but perform a runtime check
  * first.
  */
-extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
-extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int 
bytes);
 extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, 
int bytes, bits8 mask);
 
-#elif defined(USE_NEON)
-/* Use the Neon version of pg_popcount{32,64} without function pointer. */
-extern int     pg_popcount32(uint32 word);
-extern int     pg_popcount64(uint64 word);
-
-/*
- * We can try to use an SVE-optimized pg_popcount() on some systems  For that,
- * we do use a function pointer.
- */
-#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
-extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int 
bytes);
-extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, 
int bytes, bits8 mask);
 #else
+/* Use a portable implementation -- no need for a function pointer. */
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 
mask);
+
 #endif
 
-#else
-/* Use a portable implementation -- no need for a function pointer. */
-extern int     pg_popcount32(uint32 word);
-extern int     pg_popcount64(uint64 word);
-extern uint64 pg_popcount_optimized(const char *buf, int bytes);
-extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 
mask);
+/*
+ * pg_popcount32
+ *             Return the number of 1 bits set in word
+ */
+static inline int
+pg_popcount32(uint32 word)
+{
+#ifdef HAVE__BUILTIN_POPCOUNT
+       return __builtin_popcount(word);
+#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
+       int                     result = 0;
+
+       while (word != 0)
+       {
+               result += pg_number_of_ones[word & 255];
+               word >>= 8;
+       }
 
+       return result;
+#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+}
+
+/*
+ * pg_popcount64
+ *             Return the number of 1 bits set in word
+ */
+static inline int
+pg_popcount64(uint64 word)
+{
+#ifdef HAVE__BUILTIN_POPCOUNT
+#if SIZEOF_LONG == 8
+       return __builtin_popcountl(word);
+#elif SIZEOF_LONG_LONG == 8
+       return __builtin_popcountll(word);
+#else
+#error "cannot find integer of the same size as uint64_t"
 #endif
+#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
+       int                     result = 0;
+
+       while (word != 0)
+       {
+               result += pg_number_of_ones[word & 255];
+               word >>= 8;
+       }
+
+       return result;
+#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+}
 
 /*
  * Returns the number of 1-bits in buf.
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index bec06c06fc3..49b130f1306 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -96,56 +96,6 @@ const uint8 pg_number_of_ones[256] = {
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * pg_popcount32_portable
- *             Return the number of 1 bits set in word
- */
-int
-pg_popcount32_portable(uint32 word)
-{
-#ifdef HAVE__BUILTIN_POPCOUNT
-       return __builtin_popcount(word);
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
-}
-
-/*
- * pg_popcount64_portable
- *             Return the number of 1 bits set in word
- */
-int
-pg_popcount64_portable(uint64 word)
-{
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if SIZEOF_LONG == 8
-       return __builtin_popcountl(word);
-#elif SIZEOF_LONG_LONG == 8
-       return __builtin_popcountll(word);
-#else
-#error "cannot find integer of the same size as uint64_t"
-#endif
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
-}
-
 /*
  * pg_popcount_portable
  *             Returns the number of 1-bits in buf
@@ -163,7 +113,7 @@ pg_popcount_portable(const char *buf, int bytes)
 
                while (bytes >= 8)
                {
-                       popcnt += pg_popcount64_portable(*words++);
+                       popcnt += pg_popcount64(*words++);
                        bytes -= 8;
                }
 
@@ -197,7 +147,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
 
                while (bytes >= 8)
                {
-                       popcnt += pg_popcount64_portable(*words++ & maskv);
+                       popcnt += pg_popcount64(*words++ & maskv);
                        bytes -= 8;
                }
 
@@ -220,17 +170,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
  * actual external functions.  The compiler should be able to inline the
  * portable versions here.
  */
-int
-pg_popcount32(uint32 word)
-{
-       return pg_popcount32_portable(word);
-}
-
-int
-pg_popcount64(uint64 word)
-{
-       return pg_popcount64_portable(word);
-}
 
 /*
  * pg_popcount_optimized
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index ba57f2cd4bd..f474ef45510 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -292,21 +292,11 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
 #endif                                                 /* ! 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
 
 /*
- * pg_popcount32
+ * pg_popcount64_neon
  *             Return number of 1 bits in word
  */
-int
-pg_popcount32(uint32 word)
-{
-       return pg_popcount64((uint64) word);
-}
-
-/*
- * pg_popcount64
- *             Return number of 1 bits in word
- */
-int
-pg_popcount64(uint64 word)
+static inline int
+pg_popcount64_neon(uint64 word)
 {
        /*
         * For some compilers, __builtin_popcountl() already emits Neon
@@ -383,7 +373,7 @@ pg_popcount_neon(const char *buf, int bytes)
         */
        for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
        {
-               popcnt += pg_popcount64(*((const uint64 *) buf));
+               popcnt += pg_popcount64_neon(*((const uint64 *) buf));
                buf += sizeof(uint64);
        }
 
@@ -465,7 +455,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 
mask)
         */
        for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
        {
-               popcnt += pg_popcount64(*((const uint64 *) buf) & mask64);
+               popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64);
                buf += sizeof(uint64);
        }
 
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 7aebf69898b..6bce089432f 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -36,8 +36,6 @@
  * operation, but in practice this is close enough, and "sse42" seems easier to
  * follow than "popcnt" for these names.
  */
-static inline int pg_popcount32_sse42(uint32 word);
-static inline int pg_popcount64_sse42(uint64 word);
 static uint64 pg_popcount_sse42(const char *buf, int bytes);
 static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
 
@@ -55,12 +53,8 @@ static uint64 pg_popcount_masked_avx512(const char *buf, int 
bytes, bits8 mask);
  * what the current CPU supports) and then will call the pointer to fulfill the
  * caller's request.
  */
-static int     pg_popcount32_choose(uint32 word);
-static int     pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
 static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 
mask);
-int                    (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
-int                    (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64         (*pg_popcount_optimized) (const char *buf, int bytes) = 
pg_popcount_choose;
 uint64         (*pg_popcount_masked_optimized) (const char *buf, int bytes, 
bits8 mask) = pg_popcount_masked_choose;
 
@@ -157,7 +151,7 @@ pg_popcount_avx512_available(void)
 #endif                                                 /* 
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
 
 /*
- * These functions get called on the first call to pg_popcount32 etc.
+ * These functions get called on the first call to pg_popcount(), etc.
  * They detect whether we can use the asm implementations, and replace
  * the function pointers so that subsequent calls are routed directly to
  * the chosen implementation.
@@ -167,15 +161,11 @@ choose_popcount_functions(void)
 {
        if (pg_popcount_sse42_available())
        {
-               pg_popcount32 = pg_popcount32_sse42;
-               pg_popcount64 = pg_popcount64_sse42;
                pg_popcount_optimized = pg_popcount_sse42;
                pg_popcount_masked_optimized = pg_popcount_masked_sse42;
        }
        else
        {
-               pg_popcount32 = pg_popcount32_portable;
-               pg_popcount64 = pg_popcount64_portable;
                pg_popcount_optimized = pg_popcount_portable;
                pg_popcount_masked_optimized = pg_popcount_masked_portable;
        }
@@ -189,20 +179,6 @@ choose_popcount_functions(void)
 #endif
 }
 
-static int
-pg_popcount32_choose(uint32 word)
-{
-       choose_popcount_functions();
-       return pg_popcount32(word);
-}
-
-static int
-pg_popcount64_choose(uint64 word)
-{
-       choose_popcount_functions();
-       return pg_popcount64(word);
-}
-
 static uint64
 pg_popcount_choose(const char *buf, int bytes)
 {
@@ -338,23 +314,6 @@ pg_popcount_masked_avx512(const char *buf, int bytes, 
bits8 mask)
 
 #endif                                                 /* 
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
 
-/*
- * pg_popcount32_sse42
- *             Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount32_sse42(uint32 word)
-{
-#ifdef _MSC_VER
-       return __popcnt(word);
-#else
-       uint32          res;
-
-__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
-       return (int) res;
-#endif
-}
-
 /*
  * pg_popcount64_sse42
  *             Return the number of 1 bits set in word
-- 
2.50.1 (Apple Git-155)

>From 4044ef13514b68e7172a00347483f9c73901978b Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Fri, 6 Feb 2026 10:00:21 -0600
Subject: [PATCH v13 3/5] Remove uses of popcount builtins.

This commit replaces the implementations of pg_popcount{32,64} with
branchless ones in plain C.  While these new implementations do not
make use of more sophisticated population count instructions
available on some CPUs, testing indicates they perform well,
especially now that they are inlined.  A follow-up commit will
replace various loops over these functions with calls to
pg_popcount(), leaving us little reason to worry about
micro-optimizing them further.

Since this commit removes the only uses of the popcount builtins,
we can also remove the corresponding configuration checks.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 configure                      | 38 ----------------------------
 configure.ac                   |  1 -
 meson.build                    |  1 -
 src/include/pg_config.h.in     |  3 ---
 src/include/port/pg_bitutils.h | 46 +++++++++++-----------------------
 src/port/pg_popcount_aarch64.c |  5 ----
 6 files changed, 14 insertions(+), 80 deletions(-)

diff --git a/configure b/configure
index a10a2c85c6a..8fe368b7201 100755
--- a/configure
+++ b/configure
@@ -15878,44 +15878,6 @@ cat >>confdefs.h <<_ACEOF
 #define HAVE__BUILTIN_CTZ 1
 _ACEOF
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5
-$as_echo_n "checking for __builtin_popcount... " >&6; }
-if ${pgac_cv__builtin_popcount+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-call__builtin_popcount(unsigned int x)
-{
-    return __builtin_popcount(x);
-}
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv__builtin_popcount=yes
-else
-  pgac_cv__builtin_popcount=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" 
>&5
-$as_echo "$pgac_cv__builtin_popcount" >&6; }
-if test x"${pgac_cv__builtin_popcount}" = xyes ; then
-
-cat >>confdefs.h <<_ACEOF
-#define HAVE__BUILTIN_POPCOUNT 1
-_ACEOF
-
 fi
 # __builtin_frame_address may draw a diagnostic for non-constant argument,
 # so it needs a different test function.
diff --git a/configure.ac b/configure.ac
index 814e64a967e..f569b1c3f35 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1852,7 +1852,6 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x])
 # We assume that we needn't test all widths of these explicitly:
 PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x])
 PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x])
-PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x])
 # __builtin_frame_address may draw a diagnostic for non-constant argument,
 # so it needs a different test function.
 PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0])
diff --git a/meson.build b/meson.build
index 96b3869df86..c89293cd80f 100644
--- a/meson.build
+++ b/meson.build
@@ -2004,7 +2004,6 @@ builtins = [
   'ctz',
   'constant_p',
   'frame_address',
-  'popcount',
   'unreachable',
 ]
 
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 339268dc8ef..2651b56ae4d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -526,9 +526,6 @@
 /* Define to 1 if your compiler understands __builtin_$op_overflow. */
 #undef HAVE__BUILTIN_OP_OVERFLOW
 
-/* Define to 1 if your compiler understands __builtin_popcount. */
-#undef HAVE__BUILTIN_POPCOUNT
-
 /* Define to 1 if your compiler understands __builtin_types_compatible_p. */
 #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 789663edd93..c9b1f5f17dc 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -297,51 +297,33 @@ extern uint64 pg_popcount_masked_optimized(const char 
*buf, int bytes, bits8 mas
 /*
  * pg_popcount32
  *             Return the number of 1 bits set in word
+ *
+ * Adapted from
+ * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel.
  */
 static inline int
 pg_popcount32(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-       return __builtin_popcount(word);
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+       word -= (word >> 1) & 0x55555555;
+       word = (word & 0x33333333) + ((word >> 2) & 0x33333333);
+       return (((word + (word >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24;
 }
 
 /*
  * pg_popcount64
  *             Return the number of 1 bits set in word
+ *
+ * Adapted from
+ * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel.
  */
 static inline int
 pg_popcount64(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if SIZEOF_LONG == 8
-       return __builtin_popcountl(word);
-#elif SIZEOF_LONG_LONG == 8
-       return __builtin_popcountll(word);
-#else
-#error "cannot find integer of the same size as uint64_t"
-#endif
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+       word -= (word >> 1) & UINT64CONST(0x5555555555555555);
+       word = (word & UINT64CONST(0x3333333333333333)) +
+               ((word >> 2) & UINT64CONST(0x3333333333333333));
+       word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f);
+       return (word * UINT64CONST(0x101010101010101)) >> 56;
 }
 
 /*
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index f474ef45510..b0f10ae07a4 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -298,11 +298,6 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
 static inline int
 pg_popcount64_neon(uint64 word)
 {
-       /*
-        * For some compilers, __builtin_popcountl() already emits Neon
-        * instructions.  The line below should compile to the same code on 
those
-        * systems.
-        */
        return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
 }
 
-- 
2.50.1 (Apple Git-155)

>From d67aced067d8334874da7755425d0a18be8f29a9 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Wed, 11 Feb 2026 11:19:35 -0600
Subject: [PATCH v13 4/5] Convert some popcount functions to macros.

Presently, if no special popcount instructions are available,
pg_popcount() and pg_popcount_masked() are implemented as external
functions that call the portable versions.  By converting these to
macros, we can save a handful of lines of code and avoid extra
function calls (if the compiler wasn't previously inlining the
portable versions in the external functions).

Author: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZZkp2_wP3Dd1p_cVqWyXuJYL7wCMHSQL4p-m-cy0iwHAg%40mail.gmail.com
---
 src/include/port/pg_bitutils.h |  9 +++++++--
 src/port/pg_bitutils.c         | 31 -------------------------------
 2 files changed, 7 insertions(+), 33 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index c9b1f5f17dc..11ee206a7b2 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -287,11 +287,16 @@ extern uint64 pg_popcount_masked_portable(const char 
*buf, int bytes, bits8 mask
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int 
bytes);
 extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, 
int bytes, bits8 mask);
 
-#else
-/* Use a portable implementation -- no need for a function pointer. */
+#elif defined(USE_NEON)
+/* Use Neon implementations -- no need for a function pointer. */
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 
mask);
 
+#else
+/* Use a portable implementation -- no need for a function pointer. */
+#define pg_popcount_optimized(buf, bytes) pg_popcount_portable(buf, bytes)
+#define pg_popcount_masked_optimized(buf, bytes, mask) 
pg_popcount_masked_portable(buf, bytes, mask)
+
 #endif
 
 /*
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 49b130f1306..047044b2917 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -161,34 +161,3 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
 
        return popcnt;
 }
-
-#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON)
-
-/*
- * When special CPU instructions are not available, there's no point in using
- * function pointers to vary the implementation.  We instead just make these
- * actual external functions.  The compiler should be able to inline the
- * portable versions here.
- */
-
-/*
- * pg_popcount_optimized
- *             Returns the number of 1-bits in buf
- */
-uint64
-pg_popcount_optimized(const char *buf, int bytes)
-{
-       return pg_popcount_portable(buf, bytes);
-}
-
-/*
- * pg_popcount_masked_optimized
- *             Returns the number of 1-bits in buf after applying the mask to 
each byte
- */
-uint64
-pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
-{
-       return pg_popcount_masked_portable(buf, bytes, mask);
-}
-
-#endif                                                 /* ! 
HAVE_X86_64_POPCNTQ && ! USE_NEON */
-- 
2.50.1 (Apple Git-155)

>From cea9c45c95ec80cacbc7d60b79e86585301c60a3 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Tue, 10 Feb 2026 12:40:08 -0600
Subject: [PATCH v13 5/5] Make use of pg_popcount() in more places.

This replaces some loops over word-length popcount functions with
calls to pg_popcount().  Since pg_popcount() uses a function
pointer for inputs with sizes >= a Bitmapset word, this produces a
small regression for the common one-word case in bms_num_members().
To deal with that, this commit adds an inlined fast-path for that
case.  This fast-path could arguably go in pg_popcount() itself
(with an appropriate alignment check), but that is left for future
study.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 src/backend/nodes/bitmapset.c | 29 +++++++----------------------
 src/include/lib/radixtree.h   |  6 +++---
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index a4765876c31..786f343b3c9 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -553,14 +553,8 @@ bms_member_index(Bitmapset *a, int x)
        bitnum = BITNUM(x);
 
        /* count bits in preceding words */
-       for (int i = 0; i < wordnum; i++)
-       {
-               bitmapword      w = a->words[i];
-
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       }
+       result += pg_popcount((const char *) a->words,
+                                                 wordnum * sizeof(bitmapword));
 
        /*
         * Now add bits of the last word, but only those before the item. We can
@@ -749,26 +743,17 @@ bms_get_singleton_member(const Bitmapset *a, int *member)
 int
 bms_num_members(const Bitmapset *a)
 {
-       int                     result = 0;
-       int                     nwords;
-       int                     wordnum;
-
        Assert(bms_is_valid_set(a));
 
        if (a == NULL)
                return 0;
 
-       nwords = a->nwords;
-       wordnum = 0;
-       do
-       {
-               bitmapword      w = a->words[wordnum];
+       /* fast-path for common case */
+       if (a->nwords == 1)
+               return bmw_popcount(a->words[0]);
 
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       } while (++wordnum < nwords);
-       return result;
+       return pg_popcount((const char *) a->words,
+                                          a->nwords * sizeof(bitmapword));
 }
 
 /*
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h
index b223ce10a2d..e6c9a591c17 100644
--- a/src/include/lib/radixtree.h
+++ b/src/include/lib/radixtree.h
@@ -2721,12 +2721,12 @@ RT_VERIFY_NODE(RT_NODE * node)
                case RT_NODE_KIND_256:
                        {
                                RT_NODE_256 *n256 = (RT_NODE_256 *) node;
-                               int                     cnt = 0;
+                               int                     cnt;
 
                                /* RT_DUMP_NODE(node); */
 
-                               for (int i = 0; i < 
RT_BM_IDX(RT_NODE_MAX_SLOTS); i++)
-                                       cnt += bmw_popcount(n256->isset[i]);
+                               cnt = pg_popcount((const char *) n256->isset,
+                                                                 
RT_NODE_MAX_SLOTS / BITS_PER_BYTE);
 
                                /*
                                 * Check if the number of used chunk matches, 
accounting for
-- 
2.50.1 (Apple Git-155)

Reply via email to