On Fri, Feb 20, 2026 at 09:39:38AM -0600, Nathan Bossart wrote:
> I spent some time looking at how clang/gcc compiled the plain-C version on
> various architectures [0], and I was pleasantly surprised to discover that
> at some point in recent history they started automatically converting it to
> special popcount instructions.  I suspect that you'd see better results on
> ppc64le if you upgraded the compiler...

If we're willing to rely on this behavior, we could even remove
pg_popcount64_neon() and pg_popcount64_sse42().  We still need to add
"pg_attribute_target("popcnt")" and a corresponding configure check for the
x86 stuff, so it's not as impressive from a code-removal standpoint, but it
at least allows us to remove some uses of intrinsics and inline assembly.

-- 
nathan
>From f9b62118024e018e50d58c99718d9500ab4c5f8a Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Fri, 20 Feb 2026 14:33:27 -0600
Subject: [PATCH v15 1/2] Remove uses of popcount builtins.

This commit replaces the implementations of pg_popcount{32,64} with
branchless ones in plain C.  Newer versions of popular compilers
will automatically replace these with more sophisticated population
count instructions if possible, leaving us little reason to
continue using the builtins.  This also allows us to remove the
remaining architecture-specific implementations of pg_popcount64(),
which were only used by the corresponding architecture-specific
implementations of pg_popcount() and pg_popcount_masked().  Since
this commit removes the only uses of the popcount builtins, we can
remove the corresponding configuration checks, too.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 config/c-compiler.m4           |  26 +++++++
 configure                      | 119 +++++++++++++--------------------
 configure.ac                   |  23 +++----
 meson.build                    |  43 +++++++-----
 src/include/pg_config.h.in     |   7 +-
 src/include/port/pg_bitutils.h |  56 +++++++---------
 src/port/pg_bitutils.c         |   4 +-
 src/port/pg_popcount_aarch64.c |  19 +-----
 src/port/pg_popcount_x86.c     |  27 ++------
 9 files changed, 143 insertions(+), 181 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 1509dbfa2ab..93fd6b3b617 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -743,6 +743,32 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_XSAVE_INTRINSICS
 
+# PGAC_X86_POPCNT_INTRINSICS
+# ----------------------
+# Check if the compiler supports the x86 POPCNT instructions, using the
+# _popcnt64 intrinsic function.
+#
+# If the instrinsic is supported, sets pgac_x86_popcnt_intrinsics
+AC_DEFUN([PGAC_X86_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_x86_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for _popcnt64], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("popcnt")))
+    #endif
+    static int x86_popcnt_test(void)
+    {
+      return _popcnt64(0);
+    }],
+  [return x86_popcnt_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_x86_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_X86_POPCNT_INTRINSICS
+
 # PGAC_AVX512_POPCNT_INTRINSICS
 # -----------------------------
 # Check if the compiler supports the AVX-512 popcount instructions using the
diff --git a/configure b/configure
index e1a08129974..a37b8d9676c 100755
--- a/configure
+++ b/configure
@@ -15256,40 +15256,6 @@ fi
 
 
 case $host_cpu in
-  x86_64)
-    # On x86_64, check if we can compile a popcntq instruction
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether assembler 
supports x86_64 popcntq" >&5
-$as_echo_n "checking whether assembler supports x86_64 popcntq... " >&6; }
-if ${pgac_cv_have_x86_64_popcntq+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-main ()
-{
-long long x = 1; long long r;
-        __asm__ __volatile__ (" popcntq %1,%0\n" : "=q"(r) : "rm"(x));
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  pgac_cv_have_x86_64_popcntq=yes
-else
-  pgac_cv_have_x86_64_popcntq=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_have_x86_64_popcntq" 
>&5
-$as_echo "$pgac_cv_have_x86_64_popcntq" >&6; }
-    if test x"$pgac_cv_have_x86_64_popcntq" = xyes ; then
-
-$as_echo "#define HAVE_X86_64_POPCNTQ 1" >>confdefs.h
-
-    fi
-  ;;
   ppc*|powerpc*)
     # On PPC, check if compiler accepts "i"(x) when __builtin_constant_p(x).
     { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether 
__builtin_constant_p(x) implies \"i\"(x) acceptance" >&5
@@ -15836,44 +15802,6 @@ cat >>confdefs.h <<_ACEOF
 #define HAVE__BUILTIN_CTZ 1
 _ACEOF
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5
-$as_echo_n "checking for __builtin_popcount... " >&6; }
-if ${pgac_cv__builtin_popcount+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-call__builtin_popcount(unsigned int x)
-{
-    return __builtin_popcount(x);
-}
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv__builtin_popcount=yes
-else
-  pgac_cv__builtin_popcount=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" 
>&5
-$as_echo "$pgac_cv__builtin_popcount" >&6; }
-if test x"${pgac_cv__builtin_popcount}" = xyes ; then
-
-cat >>confdefs.h <<_ACEOF
-#define HAVE__BUILTIN_POPCOUNT 1
-_ACEOF
-
 fi
 # __builtin_frame_address may draw a diagnostic for non-constant argument,
 # so it needs a different test function.
@@ -17721,6 +17649,53 @@ $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
 
 fi
 
+# Check for x86 POPCNT intrinsics
+#
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _popcnt64" >&5
+$as_echo_n "checking for _popcnt64... " >&6; }
+if ${pgac_cv_x86_popcnt_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("popcnt")))
+    #endif
+    static int x86_popcnt_test(void)
+    {
+      return _popcnt64(0);
+    }
+int
+main ()
+{
+return x86_popcnt_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_x86_popcnt_intrinsics=yes
+else
+  pgac_cv_x86_popcnt_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_x86_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_x86_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_x86_popcnt_intrinsics" = x"yes"; then
+  pgac_x86_popcnt_intrinsics=yes
+fi
+
+  if test x"$pgac_x86_popcnt_intrinsics" = x"yes"; then
+
+$as_echo "#define HAVE_X86_POPCNT_INTRINSICS 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for AVX-512 popcount intrinsics
 #
 if test x"$host_cpu" = x"x86_64"; then
diff --git a/configure.ac b/configure.ac
index cc85c233c03..1594ee802b8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1746,19 +1746,6 @@ AC_CHECK_TYPES([struct option], [], [],
 #endif])
 
 case $host_cpu in
-  x86_64)
-    # On x86_64, check if we can compile a popcntq instruction
-    AC_CACHE_CHECK([whether assembler supports x86_64 popcntq],
-                   [pgac_cv_have_x86_64_popcntq],
-    [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],
-       [long long x = 1; long long r;
-        __asm__ __volatile__ (" popcntq %1,%0\n" : "=q"(r) : "rm"(x));])],
-       [pgac_cv_have_x86_64_popcntq=yes],
-       [pgac_cv_have_x86_64_popcntq=no])])
-    if test x"$pgac_cv_have_x86_64_popcntq" = xyes ; then
-       AC_DEFINE(HAVE_X86_64_POPCNTQ, 1, [Define to 1 if the assembler 
supports X86_64's POPCNTQ instruction.])
-    fi
-  ;;
   ppc*|powerpc*)
     # On PPC, check if compiler accepts "i"(x) when __builtin_constant_p(x).
     AC_CACHE_CHECK([whether __builtin_constant_p(x) implies "i"(x) acceptance],
@@ -1851,7 +1838,6 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x])
 # We assume that we needn't test all widths of these explicitly:
 PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x])
 PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x])
-PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x])
 # __builtin_frame_address may draw a diagnostic for non-constant argument,
 # so it needs a different test function.
 PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0])
@@ -2128,6 +2114,15 @@ if test x"$pgac_xsave_intrinsics" = x"yes"; then
   AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE 
intrinsics.])
 fi
 
+# Check for x86 POPCNT intrinsics
+#
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_X86_POPCNT_INTRINSICS()
+  if test x"$pgac_x86_popcnt_intrinsics" = x"yes"; then
+    AC_DEFINE(HAVE_X86_POPCNT_INTRINSICS, 1, [Define to 1 if you have x86 
POPCNT intrinsics.])
+  fi
+fi
+
 # Check for AVX-512 popcount intrinsics
 #
 if test x"$host_cpu" = x"x86_64"; then
diff --git a/meson.build b/meson.build
index 055e96315d0..b44cfd3993a 100644
--- a/meson.build
+++ b/meson.build
@@ -2006,7 +2006,6 @@ builtins = [
   'ctz',
   'constant_p',
   'frame_address',
-  'popcount',
   'unreachable',
 ]
 
@@ -2377,6 +2376,31 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of x86 POPCNT intrinsics.
+###############################################################
+
+if host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("popcnt")))
+#endif
+int main(void)
+{
+    return _popcnt64(0);
+}
+'''
+
+  if cc.links(prog, name: 'x86 POPCNT intrinsics', args: test_c_args)
+    cdata.set('HAVE_X86_POPCNT_INTRINSICS', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Check for the availability of AVX-512 popcount intrinsics.
 ###############################################################
@@ -2641,22 +2665,7 @@ endif
 # Other CPU specific stuff
 ###############################################################
 
-if host_cpu == 'x86_64'
-
-  if cc.get_id() == 'msvc'
-    cdata.set('HAVE_X86_64_POPCNTQ', 1)
-  elif cc.compiles('''
-      void main(void)
-      {
-          long long x = 1; long long r;
-          __asm__ __volatile__ (" popcntq %1,%0\n" : "=q"(r) : "rm"(x));
-      }''',
-      name: '@0@: popcntq instruction'.format(host_cpu),
-      args: test_c_args)
-    cdata.set('HAVE_X86_64_POPCNTQ', 1)
-  endif
-
-elif host_cpu == 'ppc' or host_cpu == 'ppc64'
+if host_cpu == 'ppc' or host_cpu == 'ppc64'
   # Check if compiler accepts "i"(x) when __builtin_constant_p(x).
   if cdata.has('HAVE__BUILTIN_CONSTANT_P')
     if cc.compiles('''
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 3824a5571bb..2f6d291d9b2 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -493,8 +493,8 @@
 /* Define to 1 if you have the `X509_get_signature_info' function. */
 #undef HAVE_X509_GET_SIGNATURE_INFO
 
-/* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */
-#undef HAVE_X86_64_POPCNTQ
+/* Define to 1 if you have x86 POPCNT intrinsics. */
+#undef HAVE_X86_POPCNT_INTRINSICS
 
 /* Define to 1 if you have the <xlocale.h> header file. */
 #undef HAVE_XLOCALE_H
@@ -526,9 +526,6 @@
 /* Define to 1 if your compiler understands __builtin_$op_overflow. */
 #undef HAVE__BUILTIN_OP_OVERFLOW
 
-/* Define to 1 if your compiler understands __builtin_popcount. */
-#undef HAVE__BUILTIN_POPCOUNT
-
 /* Define to 1 if your compiler understands __builtin_types_compatible_p. */
 #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 789663edd93..53df0594823 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -279,7 +279,7 @@ pg_ceil_log2_64(uint64 num)
 extern uint64 pg_popcount_portable(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 
mask);
 
-#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
+#if defined(HAVE_X86_POPCNT_INTRINSICS) || 
defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
 /*
  * Attempt to use specialized CPU instructions, but perform a runtime check
  * first.
@@ -297,51 +297,41 @@ extern uint64 pg_popcount_masked_optimized(const char 
*buf, int bytes, bits8 mas
 /*
  * pg_popcount32
  *             Return the number of 1 bits set in word
+ *
+ * Adapted from
+ * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel.
+ *
+ * Note that newer versions of popular compilers will automatically replace
+ * this with a special popcount instruction if possible, so there isn't much
+ * reason to use builtin functions or intrinsics.
  */
 static inline int
 pg_popcount32(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-       return __builtin_popcount(word);
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+       word -= (word >> 1) & 0x55555555;
+       word = (word & 0x33333333) + ((word >> 2) & 0x33333333);
+       return (((word + (word >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24;
 }
 
 /*
  * pg_popcount64
  *             Return the number of 1 bits set in word
+ *
+ * Adapted from
+ * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel.
+ *
+ * Note that newer versions of popular compilers will automatically replace
+ * this with a special popcount instruction if possible, so there isn't much
+ * reason to use builtin functions or intrinsics.
  */
 static inline int
 pg_popcount64(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if SIZEOF_LONG == 8
-       return __builtin_popcountl(word);
-#elif SIZEOF_LONG_LONG == 8
-       return __builtin_popcountll(word);
-#else
-#error "cannot find integer of the same size as uint64_t"
-#endif
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+       word -= (word >> 1) & UINT64CONST(0x5555555555555555);
+       word = (word & UINT64CONST(0x3333333333333333)) +
+               ((word >> 2) & UINT64CONST(0x3333333333333333));
+       word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f);
+       return (word * UINT64CONST(0x101010101010101)) >> 56;
 }
 
 /*
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 49b130f1306..71327810119 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -162,7 +162,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
        return popcnt;
 }
 
-#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON)
+#if !defined(HAVE_X86_POPCNT_INTRINSICS) && !defined(USE_NEON)
 
 /*
  * When special CPU instructions are not available, there's no point in using
@@ -191,4 +191,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
        return pg_popcount_masked_portable(buf, bytes, mask);
 }
 
-#endif                                                 /* ! 
HAVE_X86_64_POPCNTQ && ! USE_NEON */
+#endif                                                 /* ! 
HAVE_X86_POPCNT_INTRINSICS && ! USE_NEON */
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index f474ef45510..74f71593721 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -291,21 +291,6 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
 
 #endif                                                 /* ! 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
 
-/*
- * pg_popcount64_neon
- *             Return number of 1 bits in word
- */
-static inline int
-pg_popcount64_neon(uint64 word)
-{
-       /*
-        * For some compilers, __builtin_popcountl() already emits Neon
-        * instructions.  The line below should compile to the same code on 
those
-        * systems.
-        */
-       return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
-}
-
 /*
  * pg_popcount_neon
  *             Returns number of 1 bits in buf
@@ -373,7 +358,7 @@ pg_popcount_neon(const char *buf, int bytes)
         */
        for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
        {
-               popcnt += pg_popcount64_neon(*((const uint64 *) buf));
+               popcnt += pg_popcount64(*((const uint64 *) buf));
                buf += sizeof(uint64);
        }
 
@@ -455,7 +440,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 
mask)
         */
        for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
        {
-               popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64);
+               popcnt += pg_popcount64(*((const uint64 *) buf) & mask64);
                buf += sizeof(uint64);
        }
 
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 6bce089432f..45ade1ee37b 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -12,7 +12,7 @@
  */
 #include "c.h"
 
-#ifdef HAVE_X86_64_POPCNTQ
+#ifdef HAVE_X86_POPCNT_INTRINSICS
 
 #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
 #include <cpuid.h>
@@ -314,28 +314,12 @@ pg_popcount_masked_avx512(const char *buf, int bytes, 
bits8 mask)
 
 #endif                                                 /* 
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
 
-/*
- * pg_popcount64_sse42
- *             Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount64_sse42(uint64 word)
-{
-#ifdef _MSC_VER
-       return __popcnt64(word);
-#else
-       uint64          res;
-
-__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
-       return (int) res;
-#endif
-}
-
 /*
  * pg_popcount_sse42
  *             Returns the number of 1-bits in buf
  */
 pg_attribute_no_sanitize_alignment()
+pg_attribute_target("popcnt")
 static uint64
 pg_popcount_sse42(const char *buf, int bytes)
 {
@@ -344,7 +328,7 @@ pg_popcount_sse42(const char *buf, int bytes)
 
        while (bytes >= 8)
        {
-               popcnt += pg_popcount64_sse42(*words++);
+               popcnt += pg_popcount64(*words++);
                bytes -= 8;
        }
 
@@ -362,6 +346,7 @@ pg_popcount_sse42(const char *buf, int bytes)
  *             Returns the number of 1-bits in buf after applying the mask to 
each byte
  */
 pg_attribute_no_sanitize_alignment()
+pg_attribute_target("popcnt")
 static uint64
 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
 {
@@ -371,7 +356,7 @@ pg_popcount_masked_sse42(const char *buf, int bytes, bits8 
mask)
 
        while (bytes >= 8)
        {
-               popcnt += pg_popcount64_sse42(*words++ & maskv);
+               popcnt += pg_popcount64(*words++ & maskv);
                bytes -= 8;
        }
 
@@ -384,4 +369,4 @@ pg_popcount_masked_sse42(const char *buf, int bytes, bits8 
mask)
        return popcnt;
 }
 
-#endif                                                 /* HAVE_X86_64_POPCNTQ 
*/
+#endif                                                 /* 
HAVE_X86_POPCNT_INTRINSICS */
-- 
2.50.1 (Apple Git-155)

>From d5404599596b58df4d7809d54b99c6d37d34efdc Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Tue, 10 Feb 2026 12:40:08 -0600
Subject: [PATCH v15 2/2] Make use of pg_popcount() in more places.

This replaces some loops over word-length popcount functions with
calls to pg_popcount().  Since pg_popcount() uses a function
pointer for inputs with sizes >= a Bitmapset word, this produces a
small regression for the common one-word case in bms_num_members().
To deal with that, this commit adds an inlined fast-path for that
case.  This fast-path could arguably go in pg_popcount() itself
(with an appropriate alignment check), but that is left for future
study.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 src/backend/nodes/bitmapset.c | 29 +++++++----------------------
 src/include/lib/radixtree.h   |  6 +++---
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index a4765876c31..786f343b3c9 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -553,14 +553,8 @@ bms_member_index(Bitmapset *a, int x)
        bitnum = BITNUM(x);
 
        /* count bits in preceding words */
-       for (int i = 0; i < wordnum; i++)
-       {
-               bitmapword      w = a->words[i];
-
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       }
+       result += pg_popcount((const char *) a->words,
+                                                 wordnum * sizeof(bitmapword));
 
        /*
         * Now add bits of the last word, but only those before the item. We can
@@ -749,26 +743,17 @@ bms_get_singleton_member(const Bitmapset *a, int *member)
 int
 bms_num_members(const Bitmapset *a)
 {
-       int                     result = 0;
-       int                     nwords;
-       int                     wordnum;
-
        Assert(bms_is_valid_set(a));
 
        if (a == NULL)
                return 0;
 
-       nwords = a->nwords;
-       wordnum = 0;
-       do
-       {
-               bitmapword      w = a->words[wordnum];
+       /* fast-path for common case */
+       if (a->nwords == 1)
+               return bmw_popcount(a->words[0]);
 
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       } while (++wordnum < nwords);
-       return result;
+       return pg_popcount((const char *) a->words,
+                                          a->nwords * sizeof(bitmapword));
 }
 
 /*
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h
index b223ce10a2d..e6c9a591c17 100644
--- a/src/include/lib/radixtree.h
+++ b/src/include/lib/radixtree.h
@@ -2721,12 +2721,12 @@ RT_VERIFY_NODE(RT_NODE * node)
                case RT_NODE_KIND_256:
                        {
                                RT_NODE_256 *n256 = (RT_NODE_256 *) node;
-                               int                     cnt = 0;
+                               int                     cnt;
 
                                /* RT_DUMP_NODE(node); */
 
-                               for (int i = 0; i < 
RT_BM_IDX(RT_NODE_MAX_SLOTS); i++)
-                                       cnt += bmw_popcount(n256->isset[i]);
+                               cnt = pg_popcount((const char *) n256->isset,
+                                                                 
RT_NODE_MAX_SLOTS / BITS_PER_BYTE);
 
                                /*
                                 * Check if the number of used chunk matches, 
accounting for
-- 
2.50.1 (Apple Git-155)

Reply via email to