I've been preparing these for commit, and I've attached what I have so far. A few notes:
* 0001 just renames the TRY_POPCNT_FAST macro to indicate that it's x86_64-specific. IMO this is worth doing independent of this patch set, but it's more important with the patch set since we need something similar for Aarch64. I think we should also consider moving the x86_64 stuff to its own file (perhaps combining it with the AVX-512 stuff), but that can probably wait until later. * 0002 introduces the Neon implementation, which conveniently doesn't need configure-time checks or function pointers. I noticed that some compilers (e.g., Apple clang 16) compile in Neon instructions already, but our hand-rolled implementation is better about instruction-level parallelism and seems to still be quite a bit faster. * 0003 introduces the SVE implementation. You'll notice I've moved all the function pointer gymnastics into the pg_popcount_aarch64.c file, which is where the Neon implementations live, too. I also tried to clean up the configure checks a bit. I imagine it's possible to make them more compact, but I felt that the enhanced readability was worth it. * For both Neon and SVE, I do see improvements with looping over 4 registers at a time, so IMHO it's worth doing so even if it performs the same as 2-register blocks on some hardware. I did add a 2-register block in the Neon implementation for processing the tail because I was worried about its performance on smaller buffers, but that part might get removed if I can't measure any difference. I'm planning to run several more benchmarks, but everything I've seen thus far has looked pretty good. -- nathan
>From c14a62c26196731aa2379babf535e698260f0066 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Fri, 21 Mar 2025 09:47:30 -0500 Subject: [PATCH v8 1/3] Rename TRY_POPCNT_FAST to POPCNT_X86_64. --- src/include/port/pg_bitutils.h | 6 +++--- src/port/pg_bitutils.c | 14 +++++++------- src/port/pg_popcount_avx512.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 62554ce685a..70bf65c04e4 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -294,11 +294,11 @@ pg_ceil_log2_64(uint64 num) */ #ifdef HAVE_X86_64_POPCNTQ #if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID) -#define TRY_POPCNT_FAST 1 +#define POPCNT_X86_64 1 #endif #endif -#ifdef TRY_POPCNT_FAST +#ifdef POPCNT_X86_64 /* Attempt to use the POPCNT instruction, but perform a runtime check first */ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); @@ -322,7 +322,7 @@ extern int pg_popcount64(uint64 word); extern uint64 pg_popcount_optimized(const char *buf, int bytes); extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); -#endif /* TRY_POPCNT_FAST */ +#endif /* POPCNT_X86_64 */ /* * Returns the number of 1-bits in buf. 
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 5677525693d..34904c2fbd9 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -108,7 +108,7 @@ static inline int pg_popcount64_slow(uint64 word); static uint64 pg_popcount_slow(const char *buf, int bytes); static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask); -#ifdef TRY_POPCNT_FAST +#ifdef POPCNT_X86_64 static bool pg_popcount_available(void); static int pg_popcount32_choose(uint32 word); static int pg_popcount64_choose(uint64 word); @@ -123,9 +123,9 @@ int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; -#endif /* TRY_POPCNT_FAST */ +#endif /* POPCNT_X86_64 */ -#ifdef TRY_POPCNT_FAST +#ifdef POPCNT_X86_64 /* * Return true if CPUID indicates that the POPCNT instruction is available. @@ -337,7 +337,7 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask) return popcnt; } -#endif /* TRY_POPCNT_FAST */ +#endif /* POPCNT_X86_64 */ /* @@ -486,13 +486,13 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask) return popcnt; } -#ifndef TRY_POPCNT_FAST +#ifndef POPCNT_X86_64 /* * When the POPCNT instruction is not available, there's no point in using * function pointers to vary the implementation between the fast and slow * method. We instead just make these actual external functions when - * TRY_POPCNT_FAST is not defined. The compiler should be able to inline + * POPCNT_X86_64 is not defined. The compiler should be able to inline * the slow versions here. 
*/ int @@ -527,4 +527,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) return pg_popcount_masked_slow(buf, bytes, mask); } -#endif /* !TRY_POPCNT_FAST */ +#endif /* !POPCNT_X86_64 */ diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c index dac895a0fc2..63f697ebea8 100644 --- a/src/port/pg_popcount_avx512.c +++ b/src/port/pg_popcount_avx512.c @@ -27,11 +27,11 @@ #include "port/pg_bitutils.h" /* - * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to + * It's probably unlikely that POPCNT_X86_64 won't be set if we are able to * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on - * the function pointers that are only used when TRY_POPCNT_FAST is set. + * the function pointers that are only used when POPCNT_X86_64 is set. */ -#ifdef TRY_POPCNT_FAST +#ifdef POPCNT_X86_64 /* * Does CPUID say there's support for XSAVE instructions? @@ -219,5 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) return _mm512_reduce_add_epi64(accum); } -#endif /* TRY_POPCNT_FAST */ +#endif /* POPCNT_X86_64 */ #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ -- 2.39.5 (Apple Git-154)
>From 3ebc1321e6782919980d3410d3bc527fd77751fc Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Fri, 21 Mar 2025 11:04:26 -0500 Subject: [PATCH v8 2/3] Neon popcount support. --- src/include/port/pg_bitutils.h | 9 ++ src/port/Makefile | 1 + src/port/meson.build | 1 + src/port/pg_bitutils.c | 22 +++- src/port/pg_popcount_aarch64.c | 203 +++++++++++++++++++++++++++++++++ 5 files changed, 230 insertions(+), 6 deletions(-) create mode 100644 src/port/pg_popcount_aarch64.c diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 70bf65c04e4..9aa07e5d574 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num) #endif #endif +/* + * On AArch64, we can use Neon instructions if the compiler provides access to + * them (as indicated by __ARM_NEON). As in simd.h, we assume that all + * available 64-bit hardware has Neon support. + */ +#if defined(__aarch64__) && defined(__ARM_NEON) +#define POPCNT_AARCH64 1 +#endif + #ifdef POPCNT_X86_64 /* Attempt to use the POPCNT instruction, but perform a runtime check first */ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); diff --git a/src/port/Makefile b/src/port/Makefile index 4c224319512..cb86b7141e6 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_popcount_aarch64.o \ pg_popcount_avx512.o \ pg_strong_random.o \ pgcheckdir.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..cad0dd8f4f8 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_popcount_aarch64.c', 'pg_popcount_avx512.c', 'pg_strong_random.c', 'pgcheckdir.c', diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 34904c2fbd9..8b6f20b54e9 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -103,10 +103,15 @@ const uint8 
pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; +/* + * If we are building the Neon versions, we don't need the "slow" fallbacks. + */ +#ifndef POPCNT_AARCH64 static inline int pg_popcount32_slow(uint32 word); static inline int pg_popcount64_slow(uint64 word); static uint64 pg_popcount_slow(const char *buf, int bytes); static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask); +#endif #ifdef POPCNT_X86_64 static bool pg_popcount_available(void); @@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask) #endif /* POPCNT_X86_64 */ +/* + * If we are building the Neon versions, we don't need the "slow" fallbacks. + */ +#ifndef POPCNT_AARCH64 /* * pg_popcount32_slow @@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask) return popcnt; } -#ifndef POPCNT_X86_64 +#endif /* ! POPCNT_AARCH64 */ + +#if !defined(POPCNT_X86_64) && !defined(POPCNT_AARCH64) /* - * When the POPCNT instruction is not available, there's no point in using + * When special CPU instructions are not available, there's no point in using * function pointers to vary the implementation between the fast and slow - * method. We instead just make these actual external functions when - * POPCNT_X86_64 is not defined. The compiler should be able to inline - * the slow versions here. + * method. We instead just make these actual external functions. The compiler + * should be able to inline the slow versions here. */ int pg_popcount32(uint32 word) @@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) return pg_popcount_masked_slow(buf, bytes, mask); } -#endif /* !POPCNT_X86_64 */ +#endif /* ! POPCNT_X86_64 && ! 
POPCNT_AARCH64 */ diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c new file mode 100644 index 00000000000..426bae660ef --- /dev/null +++ b/src/port/pg_popcount_aarch64.c @@ -0,0 +1,203 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_aarch64.c + * Holds the AArch64 pg_popcount() implementations. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_aarch64.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "port/pg_bitutils.h" + +#ifdef POPCNT_AARCH64 + +#include <arm_neon.h> + +/* + * pg_popcount32 + * Return number of 1 bits in word + */ +int +pg_popcount32(uint32 word) +{ + return pg_popcount64((uint64) word); +} + +/* + * pg_popcount64 + * Return number of 1 bits in word + */ +int +pg_popcount64(uint64 word) +{ + return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word))); +} + +/* + * pg_popcount_optimized + * Returns number of 1 bits in buf + */ +uint64 +pg_popcount_optimized(const char *buf, int bytes) +{ + uint8x16_t vec; + uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t); + uint64x2_t accum1 = vdupq_n_u64(0), + accum2 = vdupq_n_u64(0), + accum3 = vdupq_n_u64(0), + accum4 = vdupq_n_u64(0); + uint64 popcnt = 0; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. 
+ */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + vec = vld1q_u8((const uint8 *) buf); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. + */ + bytes_per_iteration = 2 * sizeof(uint8x16_t); + if (bytes >= bytes_per_iteration) + { + vec = vld1q_u8((const uint8 *) buf); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vld1q_u8((const uint8 *) buf); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. + */ + popcnt += vaddvq_u64(vaddq_u64(accum1, accum2)); + popcnt += vaddvq_u64(vaddq_u64(accum3, accum4)); + + /* + * Process remaining 8-byte blocks. + */ + for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) + { + popcnt += pg_popcount64(*((uint64 *) buf)); + buf += sizeof(uint64); + } + + /* + * Process any remaining data byte-by-byte. 
+ */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++]; + + return popcnt; +} + +/* + * pg_popcount_masked_optimized + * Returns number of 1 bits in buf after applying the mask to each byte + */ +uint64 +pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +{ + uint8x16_t vec; + uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t); + uint64x2_t accum1 = vdupq_n_u64(0), + accum2 = vdupq_n_u64(0), + accum3 = vdupq_n_u64(0), + accum4 = vdupq_n_u64(0); + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + uint8x16_t maskv = vdupq_n_u8(mask); + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. + */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. + */ + bytes_per_iteration = 2 * sizeof(uint8x16_t); + if (bytes >= bytes_per_iteration) + { + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec)))); + buf += sizeof(uint8x16_t); + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. 
+ */ + popcnt += vaddvq_u64(vaddq_u64(accum1, accum2)); + popcnt += vaddvq_u64(vaddq_u64(accum3, accum4)); + + /* + * Process remaining 8-byte blocks. + */ + for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) + { + popcnt += pg_popcount64(*((uint64 *) buf) & mask64); + buf += sizeof(uint64); + } + + /* + * Process any remaining data byte-by-byte. + */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask]; + + return popcnt; +} + +#endif /* POPCNT_AARCH64 */ -- 2.39.5 (Apple Git-154)
>From 36f954a5735911af3e057f24d8803c32819e738d Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Fri, 21 Mar 2025 20:24:44 -0500 Subject: [PATCH v8 3/3] SVE popcount support. --- config/c-compiler.m4 | 64 +++++++++ configure | 84 ++++++++++++ configure.ac | 9 ++ meson.build | 61 +++++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_bitutils.h | 17 +++ src/port/pg_popcount_aarch64.c | 235 ++++++++++++++++++++++++++++++++- 7 files changed, 467 insertions(+), 6 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 3712e81e38c..d1e7461f6f6 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -708,3 +708,67 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_AVX512_POPCNT_INTRINSICS + +# PGAC_SVE_POPCNT_INTRINSICS +# -------------------------- +# Check if the compiler supports the SVE popcount instructions using the +# svptrue_b64, svdup_u64, svcntb, svld1, svadd_x, svcnt_x, svaddv, +# svwhilelt_b8, and svand_x intrinsic functions. +# +# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics. 
+AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl +AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h> + + char buf[500]; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + static int popcount_test(void) + { + uint32_t vec_len = svcntb(); + int bytes = sizeof(buf); + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + svbool_t pred = svptrue_b64(); + uint64_t popcnt = 0, + mask = 0x5555555555555555; + char *p = buf; + + for (; bytes >= vec_len * 2; bytes -= vec_len * 2) + { + svuint64_t vec; + + vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec)); + p += vec_len; + + vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec)); + p += vec_len; + } + + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + for (; bytes >= vec_len; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8(0, bytes); + vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55); + popcnt += svaddv(pred, svcnt_x(pred, vec)); + p += vec_len; + } + + return (int) popcnt; + }]], + [return popcount_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_sve_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_SVE_POPCNT_INTRINSICS diff --git a/configure b/configure index fac1e9a4e39..85f4b24caaa 100755 --- a/configure +++ b/configure @@ -17378,6 +17378,90 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi +# Check for SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5 +$as_echo_n "checking for svcnt_x... 
" >&6; } +if ${pgac_cv_sve_popcnt_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <arm_sve.h> + + char buf[500]; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + static int popcount_test(void) + { + uint32_t vec_len = svcntb(); + int bytes = sizeof(buf); + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + svbool_t pred = svptrue_b64(); + uint64_t popcnt = 0, + mask = 0x5555555555555555; + char *p = buf; + + for (; bytes >= vec_len * 2; bytes -= vec_len * 2) + { + svuint64_t vec; + + vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec)); + p += vec_len; + + vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec)); + p += vec_len; + } + + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + for (; bytes >= vec_len; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8(0, bytes); + vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55); + popcnt += svaddv(pred, svcnt_x(pred, vec)); + p += vec_len; + } + + return (int) popcnt; + } +int +main () +{ +return popcount_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_sve_popcnt_intrinsics=yes +else + pgac_cv_sve_popcnt_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics" >&5 +$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; } +if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then + pgac_sve_popcnt_intrinsics=yes +fi + + if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then + +$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. 
# { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 diff --git a/configure.ac b/configure.ac index b6d02f5ecc7..64b52940658 100644 --- a/configure.ac +++ b/configure.ac @@ -2057,6 +2057,15 @@ if test x"$host_cpu" = x"x86_64"; then fi fi +# Check for SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + PGAC_SVE_POPCNT_INTRINSICS() + if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then + AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE popcount instructions with a runtime check.]) + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # PGAC_SSE42_CRC32_INTRINSICS() diff --git a/meson.build b/meson.build index 7cf518a2765..de7e695ab6f 100644 --- a/meson.build +++ b/meson.build @@ -2285,6 +2285,67 @@ int main(void) endif +############################################################### +# Check for the availability of SVE popcount intrinsics. +############################################################### + +if host_cpu == 'aarch64' + + prog = ''' +#include <arm_sve.h> + +char buf[500]; + +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("arch=armv8-a+sve"))) +#endif +int main(void) +{ + uint32_t vec_len = svcntb(); + int bytes = sizeof(buf); + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + svbool_t pred = svptrue_b64(); + uint64_t popcnt = 0, + mask = 0x5555555555555555; + char *p = buf; + + for (; bytes >= vec_len * 2; bytes -= vec_len * 2) + { + svuint64_t vec; + + vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec)); + p += vec_len; + + vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec)); + p += vec_len; + } + + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + for (; bytes >= vec_len; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8(0, bytes); + vec = svand_x(pred, svld1(pred, 
(const uint8_t *) p), 0x55); + popcnt += svaddv(pred, svcnt_x(pred, vec)); + p += vec_len; + } + + return (int) popcnt; +} +''' + + if cc.links(prog, name: 'SVE popcount', args: test_c_args) + cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Select CRC-32C implementation. # diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index db6454090d2..2a67db077a9 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -706,6 +706,9 @@ /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */ #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE popcount instructions with a runtime check. */ +#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + /* Define to build with systemd support. (--with-systemd) */ #undef USE_SYSTEMD diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 9aa07e5d574..1bcb4ecb8ab 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif +#elif defined(POPCNT_AARCH64) +/* Use the Neon version of pg_popcount{32,64} without function pointer. */ +extern int pg_popcount32(uint32 word); +extern int pg_popcount64(uint64 word); + +/* + * We can try to use an SVE-optimized pg_popcount() on some systems. For that, + * we do use a function pointer. 
+ */ +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); +#else +extern uint64 pg_popcount_optimized(const char *buf, int bytes); +extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); +#endif + #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index 426bae660ef..48441269639 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -18,6 +18,229 @@ #include <arm_neon.h> +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +#include <arm_sve.h> + +#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL) +#include <sys/auxv.h> +#endif +#endif + +/* + * The Neon versions are built regardless of whether we are building the SVE + * versions. + */ +static uint64 pg_popcount_neon(const char *buf, int bytes); +static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask); + +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + +/* + * These are the SVE implementations of the popcount functions. + */ +static uint64 pg_popcount_sve(const char *buf, int bytes); +static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask); + +/* + * The function pointers are initially set to "choose" functions. These + * functions will first set the pointers to the right implementations (based on + * what the current CPU supports) and then will call the pointer to fulfill the + * caller's request. 
+ */ +static uint64 pg_popcount_choose(const char *buf, int bytes); +static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); +uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; +uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; + +static inline bool +pg_popcount_sve_available(void) +{ +#ifdef HAVE_ELF_AUX_INFO + unsigned long value; + + return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && + (value & HWCAP_SVE) != 0; +#elif defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +#else + return false; +#endif +} + +static inline void +choose_popcount_functions(void) +{ + if (pg_popcount_sve_available()) + { + pg_popcount_optimized = pg_popcount_sve; + pg_popcount_masked_optimized = pg_popcount_masked_sve; + } + else + { + pg_popcount_optimized = pg_popcount_neon; + pg_popcount_masked_optimized = pg_popcount_masked_neon; + } +} + +static uint64 +pg_popcount_choose(const char *buf, int bytes) +{ + choose_popcount_functions(); + return pg_popcount_optimized(buf, bytes); +} + +static uint64 +pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask) +{ + choose_popcount_functions(); + return pg_popcount_masked_optimized(buf, bytes, mask); +} + +/* + * pg_popcount_sve + * Returns number of 1 bits in buf + */ +pg_attribute_target("arch=armv8-a+sve") +static uint64 +pg_popcount_sve(const char *buf, int bytes) +{ + uint32 vec_len = svcntb(), + bytes_per_iteration = 4 * vec_len; + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0), + accum3 = svdup_u64(0), + accum4 = svdup_u64(0); + svbool_t pred = svptrue_b64(); + uint64 popcnt = 0; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. 
+ */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + svuint64_t vec; + + vec = svld1(pred, (const uint64 *) buf); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec)); + buf += vec_len; + + vec = svld1(pred, (const uint64 *) buf); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec)); + buf += vec_len; + + vec = svld1(pred, (const uint64 *) buf); + accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec)); + buf += vec_len; + + vec = svld1(pred, (const uint64 *) buf); + accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec)); + buf += vec_len; + } + + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + popcnt += svaddv(pred, svadd_x(pred, accum3, accum4)); + + /* + * Process any remaining data. + */ + for (; bytes >= vec_len; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8(0, bytes); + vec = svld1(pred, (const uint8 *) buf); + popcnt += svaddv(pred, svcnt_x(pred, vec)); + buf += vec_len; + } + + return popcnt; +} + +/* + * pg_popcount_masked_sve + * Returns number of 1 bits in buf after applying the mask to each byte + */ +pg_attribute_target("arch=armv8-a+sve") +static uint64 +pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask) +{ + uint32 vec_len = svcntb(), + bytes_per_iteration = 4 * vec_len; + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0), + accum3 = svdup_u64(0), + accum4 = svdup_u64(0); + svbool_t pred = svptrue_b64(); + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. 
+ */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + svuint64_t vec; + + vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec)); + buf += vec_len; + + vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec)); + buf += vec_len; + + vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64); + accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec)); + buf += vec_len; + + vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64); + accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec)); + buf += vec_len; + } + + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + popcnt += svaddv(pred, svadd_x(pred, accum3, accum4)); + + /* + * Process any remaining data. + */ + for (; bytes >= vec_len; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8(0, bytes); + vec = svand_x(pred, svld1(pred, (const uint8 *) buf), mask); + popcnt += svaddv(pred, svcnt_x(pred, vec)); + buf += vec_len; + } + + return popcnt; +} + +#else /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ + +/* + * When the SVE version isn't available, there's no point in using function + * pointers to vary the implementation. We instead just make these actual + * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined. + * The compiler should be able to inline the slow versions here. + */ +uint64 +pg_popcount_optimized(const char *buf, int bytes) +{ + return pg_popcount_neon(buf, bytes); +} + +uint64 +pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +{ + return pg_popcount_masked_neon(buf, bytes, mask); +} + +#endif /* ! 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ + /* * pg_popcount32 * Return number of 1 bits in word @@ -39,11 +262,11 @@ pg_popcount64(uint64 word) } /* - * pg_popcount_optimized + * pg_popcount_neon * Returns number of 1 bits in buf */ -uint64 -pg_popcount_optimized(const char *buf, int bytes) +static uint64 +pg_popcount_neon(const char *buf, int bytes) { uint8x16_t vec; uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t); @@ -119,11 +342,11 @@ pg_popcount_optimized(const char *buf, int bytes) } /* - * pg_popcount_masked_optimized + * pg_popcount_masked_neon * Returns number of 1 bits in buf after applying the mask to each byte */ -uint64 -pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +static uint64 +pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) { uint8x16_t vec; uint32 bytes_per_iteration = 4 * sizeof(uint8x16_t); -- 2.39.5 (Apple Git-154)