On Sat, Feb 07, 2026 at 03:54:31PM +0700, John Naylor wrote: > Okay, this is looking good. I have just one more suggestion: For 0002, > just copy the word-wise functions verbatim. That way, it's a pure > refactoring commit and the exception doesn't need explaining. With > that, I'd say go ahead and commit 0001/2.
Seems reasonable. Here is an updated patch set. I've also swapped 0003 and 0004. > Then after a bit more research, the final form of the inline functions > can be visible in a single commit. I've tested S390X already and hope > to test one other platform. Thanks. Looking forward to the results. -- nathan
>From 5a99e6abc34902d9b640733ab829978112b5255b Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Fri, 6 Feb 2026 09:26:54 -0600 Subject: [PATCH v12 1/4] Remove some unnecessary optimizations in popcount code. Over the past few releases, we've added a huge amount of complexity to our popcount implementations. Commits fbe327e5b4, 79e232ca01, 8c6653516c, and 25dc485074 did some preliminary refactoring, but many opportunities remain. In particular, if we disclaim interest in micro-optimizing this code for 32-bit builds and in unproven alignment checks, we can remove a decent chunk of code. Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/include/port/pg_bitutils.h | 16 +------- src/port/pg_bitutils.c | 30 --------------- src/port/pg_popcount_x86.c | 67 ++++++---------------------------- 3 files changed, 14 insertions(+), 99 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 35761f509ec..20c11b79c61 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -333,13 +333,7 @@ pg_popcount(const char *buf, int bytes) * We set the threshold to the point at which we'll first use special * instructions in the optimized version. */ -#if SIZEOF_VOID_P >= 8 - int threshold = 8; -#else - int threshold = 4; -#endif - - if (bytes < threshold) + if (bytes < 8) { uint64 popcnt = 0; @@ -364,13 +358,7 @@ pg_popcount_masked(const char *buf, int bytes, bits8 mask) * We set the threshold to the point at which we'll first use special * instructions in the optimized version. */ -#if SIZEOF_VOID_P >= 8 - int threshold = 8; -#else - int threshold = 4; -#endif - - if (bytes < threshold) + if (bytes < 8) { uint64 popcnt = 0; diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index ffda75825e5..bec06c06fc3 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -167,20 +167,6 @@ pg_popcount_portable(const char *buf, int bytes) bytes -= 8; } - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_portable(*words++); - bytes -= 4; - } - buf = (const char *) words; } #endif @@ -215,22 +201,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) bytes -= 8; } - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - uint32 maskv = ~((uint32) 0) / 0xFF * mask; - - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_portable(*words++ & maskv); - bytes -= 4; - } - buf = (const char *) words; } #endif diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 245f0167d00..7aebf69898b 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -376,40 +376,20 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); * pg_popcount_sse42 * Returns the number of 1-bits in buf */ +pg_attribute_no_sanitize_alignment() static uint64 pg_popcount_sse42(const char *buf, int bytes) { uint64 popcnt = 0; + const uint64 *words = (const uint64 *) buf; -#if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) + while (bytes >= 8) { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++); - bytes -= 8; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++); + bytes -= 8; } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++); - bytes -= 4; - } - - buf = (const char *) words; - } -#endif + buf = (const char *) words; /* Process any remaining bytes */ while (bytes--) @@ -422,44 +402,21 @@ pg_popcount_sse42(const char *buf, int bytes) * pg_popcount_masked_sse42 * Returns the number of 1-bits in buf after applying the mask to each byte */ +pg_attribute_no_sanitize_alignment() static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask) { uint64 popcnt = 0; - -#if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned */ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + const uint64 *words = (const uint64 *) buf; - if (buf == (const char *) TYPEALIGN(8, buf)) + while (bytes >= 8) { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++ & maskv); - bytes -= 8; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++ & maskv); + bytes -= 8; } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - uint32 maskv = ~((uint32) 0) / 0xFF * mask; - - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++ & maskv); - bytes -= 4; - } - buf = (const char *) words; - } -#endif + buf = (const char *) words; /* Process any remaining bytes */ while (bytes--) -- 2.50.1 (Apple Git-155)
>From 52b4969a5d13ef7ba7812a6686107cdfdddd0b7c Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Fri, 23 Jan 2026 17:31:20 -0600 Subject: [PATCH v12 2/4] Remove specialized word-length popcount implementations. The uses of these functions do not justify the level of micro-optimization we've done and may even hurt performance in some cases (e.g., due to using function pointers). This commit removes all architecture-specific implementations of pg_popcount{32,64}() and converts the portable ones to inlined functions in pg_bitutils.h. These inlined versions should produce the same code as before (but inlined), so in theory this is a net gain for many machines. Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Reviewed-by: Greg Burd <[email protected]> Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/include/port/pg_bitutils.h | 75 +++++++++++++++++++++++----------- src/port/pg_bitutils.c | 65 +---------------------------- src/port/pg_popcount_aarch64.c | 20 +++------ src/port/pg_popcount_x86.c | 43 +------------------ 4 files changed, 59 insertions(+), 144 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 20c11b79c61..789663edd93 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -276,46 +276,73 @@ pg_ceil_log2_64(uint64 num) return pg_leftmost_one_pos64(num - 1) + 1; } -extern int pg_popcount32_portable(uint32 word); -extern int pg_popcount64_portable(uint64 word); extern uint64 pg_popcount_portable(const char *buf, int bytes); extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask); -#ifdef HAVE_X86_64_POPCNTQ +#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) /* - * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check + * Attempt to use specialized CPU instructions, but perform a runtime check * first. */ -extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); -extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); -#elif defined(USE_NEON) -/* Use the Neon version of pg_popcount{32,64} without function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); - -/* - * We can try to use an SVE-optimized pg_popcount() on some systems For that, - * we do use a function pointer. - */ -#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK -extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); -extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); #else +/* Use a portable implementation -- no need for a function pointer. */ extern uint64 pg_popcount_optimized(const char *buf, int bytes); extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); + #endif -#else -/* Use a portable implementation -- no need for a function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); -extern uint64 pg_popcount_optimized(const char *buf, int bytes); -extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); +/* + * pg_popcount32 + * Return the number of 1 bits set in word + */ +static inline int +pg_popcount32(uint32 word) +{ +#ifdef HAVE__BUILTIN_POPCOUNT + return __builtin_popcount(word); +#else /* !HAVE__BUILTIN_POPCOUNT */ + int result = 0; + + while (word != 0) + { + result += pg_number_of_ones[word & 255]; + word >>= 8; + } + return result; +#endif /* HAVE__BUILTIN_POPCOUNT */ +} + +/* + * pg_popcount64 + * Return the number of 1 bits set in word + */ +static inline int +pg_popcount64(uint64 word) +{ +#ifdef HAVE__BUILTIN_POPCOUNT +#if SIZEOF_LONG == 8 + return __builtin_popcountl(word); +#elif SIZEOF_LONG_LONG == 8 + return __builtin_popcountll(word); +#else +#error "cannot find integer of the same size as uint64_t" #endif +#else /* !HAVE__BUILTIN_POPCOUNT */ + int result = 0; + + while (word != 0) + { + result += pg_number_of_ones[word & 255]; + word >>= 8; + } + + return result; +#endif /* HAVE__BUILTIN_POPCOUNT */ +} /* * Returns the number of 1-bits in buf. diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index bec06c06fc3..49b130f1306 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -96,56 +96,6 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -/* - * pg_popcount32_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount32_portable(uint32 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - -/* - * pg_popcount64_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount64_portable(uint64 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT -#if SIZEOF_LONG == 8 - return __builtin_popcountl(word); -#elif SIZEOF_LONG_LONG == 8 - return __builtin_popcountll(word); -#else -#error "cannot find integer of the same size as uint64_t" -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - /* * pg_popcount_portable * Returns the number of 1-bits in buf @@ -163,7 +113,7 @@ pg_popcount_portable(const char *buf, int bytes) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++); + popcnt += pg_popcount64(*words++); bytes -= 8; } @@ -197,7 +147,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++ & maskv); + popcnt += pg_popcount64(*words++ & maskv); bytes -= 8; } @@ -220,17 +170,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) * actual external functions. The compiler should be able to inline the * portable versions here. */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount32_portable(word); -} - -int -pg_popcount64(uint64 word) -{ - return pg_popcount64_portable(word); -} /* * pg_popcount_optimized diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index ba57f2cd4bd..f474ef45510 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -292,21 +292,11 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) #endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ /* - * pg_popcount32 + * pg_popcount64_neon * Return number of 1 bits in word */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount64((uint64) word); -} - -/* - * pg_popcount64 - * Return number of 1 bits in word - */ -int -pg_popcount64(uint64 word) +static inline int +pg_popcount64_neon(uint64 word) { /* * For some compilers, __builtin_popcountl() already emits Neon @@ -383,7 +373,7 @@ pg_popcount_neon(const char *buf, int bytes) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf)); + popcnt += pg_popcount64_neon(*((const uint64 *) buf)); buf += sizeof(uint64); } @@ -465,7 +455,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf) & mask64); + popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64); buf += sizeof(uint64); } diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 7aebf69898b..6bce089432f 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -36,8 +36,6 @@ * operation, but in practice this is close enough, and "sse42" seems easier to * follow than "popcnt" for these names. */ -static inline int pg_popcount32_sse42(uint32 word); -static inline int pg_popcount64_sse42(uint64 word); static uint64 pg_popcount_sse42(const char *buf, int bytes); static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask); @@ -55,12 +53,8 @@ static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); * what the current CPU supports) and then will call the pointer to fulfill the * caller's request. */ -static int pg_popcount32_choose(uint32 word); -static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); -int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; -int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; @@ -157,7 +151,7 @@ pg_popcount_avx512_available(void) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ /* - * These functions get called on the first call to pg_popcount32 etc. + * These functions get called on the first call to pg_popcount(), etc. * They detect whether we can use the asm implementations, and replace * the function pointers so that subsequent calls are routed directly to * the chosen implementation. @@ -167,15 +161,11 @@ choose_popcount_functions(void) { if (pg_popcount_sse42_available()) { - pg_popcount32 = pg_popcount32_sse42; - pg_popcount64 = pg_popcount64_sse42; pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; } else { - pg_popcount32 = pg_popcount32_portable; - pg_popcount64 = pg_popcount64_portable; pg_popcount_optimized = pg_popcount_portable; pg_popcount_masked_optimized = pg_popcount_masked_portable; } @@ -189,20 +179,6 @@ choose_popcount_functions(void) #endif } -static int -pg_popcount32_choose(uint32 word) -{ - choose_popcount_functions(); - return pg_popcount32(word); -} - -static int -pg_popcount64_choose(uint64 word) -{ - choose_popcount_functions(); - return pg_popcount64(word); -} - static uint64 pg_popcount_choose(const char *buf, int bytes) { @@ -338,23 +314,6 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount32_sse42 - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount32_sse42(uint32 word) -{ -#ifdef _MSC_VER - return __popcnt(word); -#else - uint32 res; - -__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif -} - /* * pg_popcount64_sse42 * Return the number of 1 bits set in word -- 2.50.1 (Apple Git-155)
>From 731d65d01cf1a1c70a6d6ef0bda24ebc5e0f024b Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Fri, 6 Feb 2026 10:00:21 -0600 Subject: [PATCH v12 3/4] Remove uses of popcount builtins. This commit replaces the implementations of pg_popcount{32,64} with branchless ones in plain C. While these new implementations do not make use of more sophisticated population count instructions available on some CPUs, testing indicates they perform well, especially now that they are inlined. A follow-up commit will replace various loops over these functions with calls to pg_popcount(), leaving us little reason to worry about micro-optimizing them further. Since this commit removes the only uses of the popcount builtins, we can also remove the corresponding configuration checks. Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- configure | 38 ---------------------------- configure.ac | 1 - meson.build | 1 - src/include/pg_config.h.in | 3 --- src/include/port/pg_bitutils.h | 46 +++++++++++----------------------- src/port/pg_popcount_aarch64.c | 5 ---- 6 files changed, 14 insertions(+), 80 deletions(-) diff --git a/configure b/configure index a10a2c85c6a..8fe368b7201 100755 --- a/configure +++ b/configure @@ -15878,44 +15878,6 @@ cat >>confdefs.h <<_ACEOF #define HAVE__BUILTIN_CTZ 1 _ACEOF -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5 -$as_echo_n "checking for __builtin_popcount... " >&6; } -if ${pgac_cv__builtin_popcount+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -call__builtin_popcount(unsigned int x) -{ - return __builtin_popcount(x); -} -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__builtin_popcount=yes -else - pgac_cv__builtin_popcount=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" >&5 -$as_echo "$pgac_cv__builtin_popcount" >&6; } -if test x"${pgac_cv__builtin_popcount}" = xyes ; then - -cat >>confdefs.h <<_ACEOF -#define HAVE__BUILTIN_POPCOUNT 1 -_ACEOF - fi # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. diff --git a/configure.ac b/configure.ac index 814e64a967e..f569b1c3f35 100644 --- a/configure.ac +++ b/configure.ac @@ -1852,7 +1852,6 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x]) # We assume that we needn't test all widths of these explicitly: PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x]) PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x]) -PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x]) # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0]) diff --git a/meson.build b/meson.build index 96b3869df86..c89293cd80f 100644 --- a/meson.build +++ b/meson.build @@ -2004,7 +2004,6 @@ builtins = [ 'ctz', 'constant_p', 'frame_address', - 'popcount', 'unreachable', ] diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 339268dc8ef..2651b56ae4d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -526,9 +526,6 @@ /* Define to 1 if your compiler understands __builtin_$op_overflow. */ #undef HAVE__BUILTIN_OP_OVERFLOW -/* Define to 1 if your compiler understands __builtin_popcount. */ -#undef HAVE__BUILTIN_POPCOUNT - /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 789663edd93..c9b1f5f17dc 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -297,51 +297,33 @@ extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mas /* * pg_popcount32 * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. */ static inline int pg_popcount32(uint32 word) { -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word -= (word >> 1) & 0x55555555; + word = (word & 0x33333333) + ((word >> 2) & 0x33333333); + return (((word + (word >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; } /* * pg_popcount64 * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. */ static inline int pg_popcount64(uint64 word) { -#ifdef HAVE__BUILTIN_POPCOUNT -#if SIZEOF_LONG == 8 - return __builtin_popcountl(word); -#elif SIZEOF_LONG_LONG == 8 - return __builtin_popcountll(word); -#else -#error "cannot find integer of the same size as uint64_t" -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word -= (word >> 1) & UINT64CONST(0x5555555555555555); + word = (word & UINT64CONST(0x3333333333333333)) + + ((word >> 2) & UINT64CONST(0x3333333333333333)); + word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f); + return (word * UINT64CONST(0x101010101010101)) >> 56; } /* diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index f474ef45510..b0f10ae07a4 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -298,11 +298,6 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) static inline int pg_popcount64_neon(uint64 word) { - /* - * For some compilers, __builtin_popcountl() already emits Neon - * instructions. The line below should compile to the same code on those - * systems. - */ return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word))); } -- 2.50.1 (Apple Git-155)
>From f4097678d1207e5fe5360c1c54d71d4fa5b317ad Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Tue, 10 Feb 2026 12:40:08 -0600 Subject: [PATCH v12 4/4] Make use of pg_popcount() in more places. This replaces some loops over word-length popcount functions with calls to pg_popcount(). Since pg_popcount() uses a function pointer for inputs with sizes >= a Bitmapset word, this produces a small regression for the common one-word case in bms_num_members(). To deal with that, this commit adds an inlined fast-path for that case. This fast-path could arguably go in pg_popcount() itself (with an appropriate alignment check), but that is left for future study. Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/backend/nodes/bitmapset.c | 29 +++++++---------------------- src/include/lib/radixtree.h | 6 +++--- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index a4765876c31..786f343b3c9 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -553,14 +553,8 @@ bms_member_index(Bitmapset *a, int x) bitnum = BITNUM(x); /* count bits in preceding words */ - for (int i = 0; i < wordnum; i++) - { - bitmapword w = a->words[i]; - - /* No need to count the bits in a zero word */ - if (w != 0) - result += bmw_popcount(w); - } + result += pg_popcount((const char *) a->words, + wordnum * sizeof(bitmapword)); /* * Now add bits of the last word, but only those before the item. We can @@ -749,26 +743,17 @@ bms_get_singleton_member(const Bitmapset *a, int *member) int bms_num_members(const Bitmapset *a) { - int result = 0; - int nwords; - int wordnum; - Assert(bms_is_valid_set(a)); if (a == NULL) return 0; - nwords = a->nwords; - wordnum = 0; - do - { - bitmapword w = a->words[wordnum]; + /* fast-path for common case */ + if (a->nwords == 1) + return bmw_popcount(a->words[0]); - /* No need to count the bits in a zero word */ - if (w != 0) - result += bmw_popcount(w); - } while (++wordnum < nwords); - return result; + return pg_popcount((const char *) a->words, + a->nwords * sizeof(bitmapword)); } /* diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index b223ce10a2d..e6c9a591c17 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -2721,12 +2721,12 @@ RT_VERIFY_NODE(RT_NODE * node) case RT_NODE_KIND_256: { RT_NODE_256 *n256 = (RT_NODE_256 *) node; - int cnt = 0; + int cnt; /* RT_DUMP_NODE(node); */ - for (int i = 0; i < RT_BM_IDX(RT_NODE_MAX_SLOTS); i++) - cnt += bmw_popcount(n256->isset[i]); + cnt = pg_popcount((const char *) n256->isset, + RT_NODE_MAX_SLOTS / BITS_PER_BYTE); /* * Check if the number of used chunk matches, accounting for -- 2.50.1 (Apple Git-155)
