I've been preparing these for commit, and I've attached what I have so far.
A few notes:

* 0001 just renames the TRY_POPCNT_FAST macro to indicate that it's
  x86_64-specific.  IMO this is worth doing independently of this patch set,
  but it's more important with the patch set since we need something
  similar for AArch64.  I think we should also consider moving the x86_64
  stuff to its own file (perhaps combining it with the AVX-512 stuff), but
  that can probably wait until later.

* 0002 introduces the Neon implementation, which conveniently doesn't need
  configure-time checks or function pointers.  I noticed that some
  compilers (e.g., Apple clang 16) already generate Neon instructions here,
  but our hand-rolled implementation makes better use of instruction-level
  parallelism and still seems to be quite a bit faster.
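
  For reference, the word-at-a-time kernel in 0002 is tiny; a minimal
  standalone version (the function name is just for illustration) looks
  like this:

      #include <arm_neon.h>
      #include <stdint.h>

      /* vcnt_u8 yields per-byte popcounts; vaddv_u8 sums the 8 lanes */
      static inline int
      neon_popcount64(uint64_t word)
      {
          return vaddv_u8(vcnt_u8(vld1_u8((const uint8_t *) &word)));
      }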

* 0003 introduces the SVE implementation.  You'll notice I've moved all the
  function pointer gymnastics into the pg_popcount_aarch64.c file, which is
  where the Neon implementations live, too.  I also tried to clean up the
  configure checks a bit.  I imagine it's possible to make them more
  compact, but I felt that the enhanced readability was worth it.
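
  The runtime check for SVE is small; on Linux it boils down to something
  like the sketch below.  0003 additionally supports elf_aux_info() on
  FreeBSD, and the #ifdef guard here is my own defensive addition for
  toolchains whose headers don't define HWCAP_SVE:

      #include <stdbool.h>
      #include <sys/auxv.h>

      static inline bool
      sve_available(void)
      {
      #ifdef HWCAP_SVE
          /* ask the kernel whether the CPU implements SVE */
          return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
      #else
          return false;
      #endif
      }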

* For both Neon and SVE, I do see improvements with looping over 4
  registers at a time, so IMHO it's worth it even if it performs the
  same as 2-register blocks on some hardware.  I did add a 2-register block
  in the Neon implementation for processing the tail because I was worried
  about its performance on smaller buffers, but that part might get removed
  if I can't measure any difference.
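
  To show the shape of that loop without the full patch, here's a condensed
  sketch of the Neon version (helper and function names are mine; the real
  code in 0002 also finishes the sub-32-byte tail word-by-word and then
  byte-by-byte):

      #include <arm_neon.h>
      #include <stdint.h>

      /* accumulate the per-byte popcounts of one 16-byte register */
      static inline uint64x2_t
      accum16(uint64x2_t accum, const char *buf)
      {
          uint8x16_t vec = vld1q_u8((const uint8_t *) buf);
          return vpadalq_u32(accum, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
      }

      static uint64_t
      popcount_unrolled(const char *buf, int bytes)
      {
          uint64x2_t a1 = vdupq_n_u64(0), a2 = vdupq_n_u64(0),
                     a3 = vdupq_n_u64(0), a4 = vdupq_n_u64(0);

          /* four independent accumulators keep several dependency chains
           * in flight per iteration */
          for (; bytes >= 64; bytes -= 64, buf += 64)
          {
              a1 = accum16(a1, buf);
              a2 = accum16(a2, buf + 16);
              a3 = accum16(a3, buf + 32);
              a4 = accum16(a4, buf + 48);
          }

          /* one 2-register step for medium-sized tails */
          if (bytes >= 32)
          {
              a1 = accum16(a1, buf);
              a2 = accum16(a2, buf + 16);
          }

          return vaddvq_u64(vaddq_u64(a1, a2)) +
                 vaddvq_u64(vaddq_u64(a3, a4));
      }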

I'm planning to run several more benchmarks, but everything I've seen thus
far has looked pretty good.

-- 
nathan
From c14a62c26196731aa2379babf535e698260f0066 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 21 Mar 2025 09:47:30 -0500
Subject: [PATCH v8 1/3] Rename TRY_POPCNT_FAST to POPCNT_X86_64.

---
 src/include/port/pg_bitutils.h |  6 +++---
 src/port/pg_bitutils.c         | 14 +++++++-------
 src/port/pg_popcount_avx512.c  |  8 ++++----
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 62554ce685a..70bf65c04e4 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -294,11 +294,11 @@ pg_ceil_log2_64(uint64 num)
  */
 #ifdef HAVE_X86_64_POPCNTQ
 #if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_FAST 1
+#define POPCNT_X86_64 1
 #endif
 #endif
 
-#ifdef TRY_POPCNT_FAST
+#ifdef POPCNT_X86_64
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
@@ -322,7 +322,7 @@ extern int  pg_popcount64(uint64 word);
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
 
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* POPCNT_X86_64 */
 
 /*
  * Returns the number of 1-bits in buf.
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 5677525693d..34904c2fbd9 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -108,7 +108,7 @@ static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
 static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
-#ifdef TRY_POPCNT_FAST
+#ifdef POPCNT_X86_64
 static bool pg_popcount_available(void);
 static int     pg_popcount32_choose(uint32 word);
 static int     pg_popcount64_choose(uint64 word);
 int                    (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int                    (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64         (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
 uint64         (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* POPCNT_X86_64 */
 
-#ifdef TRY_POPCNT_FAST
+#ifdef POPCNT_X86_64
 
 /*
  * Return true if CPUID indicates that the POPCNT instruction is available.
@@ -337,7 +337,7 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
        return popcnt;
 }
 
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* POPCNT_X86_64 */
 
 
 /*
@@ -486,13 +486,13 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
        return popcnt;
 }
 
-#ifndef TRY_POPCNT_FAST
+#ifndef POPCNT_X86_64
 
 /*
  * When the POPCNT instruction is not available, there's no point in using
  * function pointers to vary the implementation between the fast and slow
  * method.  We instead just make these actual external functions when
- * TRY_POPCNT_FAST is not defined.  The compiler should be able to inline
+ * POPCNT_X86_64 is not defined.  The compiler should be able to inline
  * the slow versions here.
  */
 int
@@ -527,4 +527,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
        return pg_popcount_masked_slow(buf, bytes, mask);
 }
 
-#endif                                                 /* !TRY_POPCNT_FAST */
+#endif                                                 /* !POPCNT_X86_64 */
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index dac895a0fc2..63f697ebea8 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -27,11 +27,11 @@
 #include "port/pg_bitutils.h"
 
 /*
- * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * It's probably unlikely that POPCNT_X86_64 won't be set if we are able to
  * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
- * the function pointers that are only used when TRY_POPCNT_FAST is set.
+ * the function pointers that are only used when POPCNT_X86_64 is set.
  */
-#ifdef TRY_POPCNT_FAST
+#ifdef POPCNT_X86_64
 
 /*
  * Does CPUID say there's support for XSAVE instructions?
@@ -219,5 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
        return _mm512_reduce_add_epi64(accum);
 }
 
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* POPCNT_X86_64 */
 #endif                                                 /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
-- 
2.39.5 (Apple Git-154)

From 3ebc1321e6782919980d3410d3bc527fd77751fc Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 21 Mar 2025 11:04:26 -0500
Subject: [PATCH v8 2/3] Neon popcount support.

---
 src/include/port/pg_bitutils.h |   9 ++
 src/port/Makefile              |   1 +
 src/port/meson.build           |   1 +
 src/port/pg_bitutils.c         |  22 +++-
 src/port/pg_popcount_aarch64.c | 203 +++++++++++++++++++++++++++++++++
 5 files changed, 230 insertions(+), 6 deletions(-)
 create mode 100644 src/port/pg_popcount_aarch64.c

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 70bf65c04e4..9aa07e5d574 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num)
 #endif
 #endif
 
+/*
+ * On AArch64, we can use Neon instructions if the compiler provides access to
+ * them (as indicated by __ARM_NEON).  As in simd.h, we assume that all
+ * available 64-bit hardware has Neon support.
+ */
+#if defined(__aarch64__) && defined(__ARM_NEON)
+#define POPCNT_AARCH64 1
+#endif
+
 #ifdef POPCNT_X86_64
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
diff --git a/src/port/Makefile b/src/port/Makefile
index 4c224319512..cb86b7141e6 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
        noblock.o \
        path.o \
        pg_bitutils.o \
+       pg_popcount_aarch64.o \
        pg_popcount_avx512.o \
        pg_strong_random.o \
        pgcheckdir.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 7fcfa728d43..cad0dd8f4f8 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_popcount_aarch64.c',
   'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 34904c2fbd9..8b6f20b54e9 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
 static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+#endif
 
 #ifdef POPCNT_X86_64
 static bool pg_popcount_available(void);
@@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
 
 #endif                                                 /* POPCNT_X86_64 */
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 
 /*
  * pg_popcount32_slow
@@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
        return popcnt;
 }
 
-#ifndef POPCNT_X86_64
+#endif                                                 /* ! POPCNT_AARCH64 */
+
+#if !defined(POPCNT_X86_64) && !defined(POPCNT_AARCH64)
 
 /*
- * When the POPCNT instruction is not available, there's no point in using
+ * When special CPU instructions are not available, there's no point in using
  * function pointers to vary the implementation between the fast and slow
- * method.  We instead just make these actual external functions when
- * POPCNT_X86_64 is not defined.  The compiler should be able to inline
- * the slow versions here.
+ * method.  We instead just make these actual external functions.  The compiler
+ * should be able to inline the slow versions here.
  */
 int
 pg_popcount32(uint32 word)
@@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
        return pg_popcount_masked_slow(buf, bytes, mask);
 }
 
-#endif                                                 /* !POPCNT_X86_64 */
+#endif                                                 /* ! POPCNT_X86_64 && ! POPCNT_AARCH64 */
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
new file mode 100644
index 00000000000..426bae660ef
--- /dev/null
+++ b/src/port/pg_popcount_aarch64.c
@@ -0,0 +1,203 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_aarch64.c
+ *       Holds the AArch64 pg_popcount() implementations.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *       src/port/pg_popcount_aarch64.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_bitutils.h"
+
+#ifdef POPCNT_AARCH64
+
+#include <arm_neon.h>
+
+/*
+ * pg_popcount32
+ *             Return number of 1 bits in word
+ */
+int
+pg_popcount32(uint32 word)
+{
+       return pg_popcount64((uint64) word);
+}
+
+/*
+ * pg_popcount64
+ *             Return number of 1 bits in word
+ */
+int
+pg_popcount64(uint64 word)
+{
+       return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
+}
+
+/*
+ * pg_popcount_optimized
+ *             Returns number of 1 bits in buf
+ */
+uint64
+pg_popcount_optimized(const char *buf, int bytes)
+{
+       uint8x16_t      vec;
+       uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
+       uint64x2_t      accum1 = vdupq_n_u64(0),
+                               accum2 = vdupq_n_u64(0),
+                               accum3 = vdupq_n_u64(0),
+                               accum4 = vdupq_n_u64(0);
+       uint64          popcnt = 0;
+
+       /*
+        * For better instruction-level parallelism, each loop iteration operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               vec = vld1q_u8((const uint8 *) buf);
+               accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+       }
+
+       /*
+        * If enough data remains, do another iteration on a block of two
+        * registers.
+        */
+       bytes_per_iteration = 2 * sizeof(uint8x16_t);
+       if (bytes >= bytes_per_iteration)
+       {
+               vec = vld1q_u8((const uint8 *) buf);
+               accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               bytes -= bytes_per_iteration;
+       }
+
+       /*
+        * Add the accumulators.
+        */
+       popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
+       popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
+
+       /*
+        * Process remaining 8-byte blocks.
+        */
+       for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
+       {
+               popcnt += pg_popcount64(*((uint64 *) buf));
+               buf += sizeof(uint64);
+       }
+
+       /*
+        * Process any remaining data byte-by-byte.
+        */
+       while (bytes--)
+               popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+       return popcnt;
+}
+
+/*
+ * pg_popcount_masked_optimized
+ *             Returns number of 1 bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+       uint8x16_t      vec;
+       uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
+       uint64x2_t      accum1 = vdupq_n_u64(0),
+                               accum2 = vdupq_n_u64(0),
+                               accum3 = vdupq_n_u64(0),
+                               accum4 = vdupq_n_u64(0);
+       uint64          popcnt = 0,
+                               mask64 = ~UINT64CONST(0) / 0xFF * mask;
+       uint8x16_t      maskv = vdupq_n_u8(mask);
+
+       /*
+        * For better instruction-level parallelism, each loop iteration operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum3 = vpadalq_u32(accum3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum4 = vpadalq_u32(accum4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+       }
+
+       /*
+        * If enough data remains, do another iteration on a block of two
+        * registers.
+        */
+       bytes_per_iteration = 2 * sizeof(uint8x16_t);
+       if (bytes >= bytes_per_iteration)
+       {
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               bytes -= bytes_per_iteration;
+       }
+
+       /*
+        * Add the accumulators.
+        */
+       popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
+       popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
+
+       /*
+        * Process remaining 8-byte blocks.
+        */
+       for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
+       {
+               popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
+               buf += sizeof(uint64);
+       }
+
+       /*
+        * Process any remaining data byte-by-byte.
+        */
+       while (bytes--)
+               popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+       return popcnt;
+}
+
+#endif                                                 /* POPCNT_AARCH64 */
-- 
2.39.5 (Apple Git-154)

From 36f954a5735911af3e057f24d8803c32819e738d Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Fri, 21 Mar 2025 20:24:44 -0500
Subject: [PATCH v8 3/3] SVE popcount support.

---
 config/c-compiler.m4           |  64 +++++++++
 configure                      |  84 ++++++++++++
 configure.ac                   |   9 ++
 meson.build                    |  61 +++++++++
 src/include/pg_config.h.in     |   3 +
 src/include/port/pg_bitutils.h |  17 +++
 src/port/pg_popcount_aarch64.c | 235 ++++++++++++++++++++++++++++++++-
 7 files changed, 467 insertions(+), 6 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3712e81e38c..d1e7461f6f6 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -708,3 +708,67 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_SVE_POPCNT_INTRINSICS
+# --------------------------
+# Check if the compiler supports the SVE popcount instructions using the
+# svptrue_b64, svdup_u64, svcntb, svld1, svadd_x, svcnt_x, svaddv,
+# svwhilelt_b8, and svand_x intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
+AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
+
+       char buf[500];
+
+       #if defined(__has_attribute) && __has_attribute (target)
+       __attribute__((target("arch=armv8-a+sve")))
+       #endif
+       static int popcount_test(void)
+       {
+               uint32_t        vec_len = svcntb();
+               int                     bytes = sizeof(buf);
+               svuint64_t      accum1 = svdup_u64(0),
+                                       accum2 = svdup_u64(0);
+               svbool_t        pred = svptrue_b64();
+               uint64_t        popcnt = 0,
+                                       mask = 0x5555555555555555;
+               char       *p = buf;
+
+               for (; bytes >= vec_len * 2; bytes -= vec_len * 2)
+               {
+                       svuint64_t  vec;
+
+                       vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+                       accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+                       p += vec_len;
+
+                       vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+                       accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+                       p += vec_len;
+               }
+
+               popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+               for (; bytes >= vec_len; bytes -= vec_len)
+               {
+                       svuint8_t   vec;
+
+                       pred = svwhilelt_b8(0, bytes);
+                       vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+                       popcnt += svaddv(pred, svcnt_x(pred, vec));
+                       p += vec_len;
+               }
+
+               return (int) popcnt;
+       }]],
+  [return popcount_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_sve_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_SVE_POPCNT_INTRINSICS
diff --git a/configure b/configure
index fac1e9a4e39..85f4b24caaa 100755
--- a/configure
+++ b/configure
@@ -17378,6 +17378,90 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
   fi
 fi
 
+# Check for SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5
+$as_echo_n "checking for svcnt_x... " >&6; }
+if ${pgac_cv_sve_popcnt_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+
+       char buf[500];
+
+       #if defined(__has_attribute) && __has_attribute (target)
+       __attribute__((target("arch=armv8-a+sve")))
+       #endif
+       static int popcount_test(void)
+       {
+               uint32_t        vec_len = svcntb();
+               int                     bytes = sizeof(buf);
+               svuint64_t      accum1 = svdup_u64(0),
+                                       accum2 = svdup_u64(0);
+               svbool_t        pred = svptrue_b64();
+               uint64_t        popcnt = 0,
+                                       mask = 0x5555555555555555;
+               char       *p = buf;
+
+               for (; bytes >= vec_len * 2; bytes -= vec_len * 2)
+               {
+                       svuint64_t  vec;
+
+                       vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+                       accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+                       p += vec_len;
+
+                       vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+                       accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+                       p += vec_len;
+               }
+
+               popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+               for (; bytes >= vec_len; bytes -= vec_len)
+               {
+                       svuint8_t   vec;
+
+                       pred = svwhilelt_b8(0, bytes);
+                       vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+                       popcnt += svaddv(pred, svcnt_x(pred, vec));
+                       p += vec_len;
+               }
+
+               return (int) popcnt;
+       }
+int
+main ()
+{
+return popcount_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_sve_popcnt_intrinsics=yes
+else
+  pgac_cv_sve_popcnt_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then
+  pgac_sve_popcnt_intrinsics=yes
+fi
+
+  if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
+
+$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
diff --git a/configure.ac b/configure.ac
index b6d02f5ecc7..64b52940658 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2057,6 +2057,15 @@ if test x"$host_cpu" = x"x86_64"; then
   fi
 fi
 
+# Check for SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  PGAC_SVE_POPCNT_INTRINSICS()
+  if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
+    AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE popcount instructions with a runtime check.])
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 7cf518a2765..de7e695ab6f 100644
--- a/meson.build
+++ b/meson.build
@@ -2285,6 +2285,67 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of SVE popcount intrinsics.
+###############################################################
+
+if host_cpu == 'aarch64'
+
+  prog = '''
+#include <arm_sve.h>
+
+char buf[500];
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("arch=armv8-a+sve")))
+#endif
+int main(void)
+{
+       uint32_t        vec_len = svcntb();
+       int                     bytes = sizeof(buf);
+       svuint64_t      accum1 = svdup_u64(0),
+                               accum2 = svdup_u64(0);
+       svbool_t        pred = svptrue_b64();
+       uint64_t        popcnt = 0,
+                               mask = 0x5555555555555555;
+       char       *p = buf;
+
+       for (; bytes >= vec_len * 2; bytes -= vec_len * 2)
+       {
+               svuint64_t      vec;
+
+               vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+               accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+               p += vec_len;
+
+               vec = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+               accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+               p += vec_len;
+       }
+
+       popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+       for (; bytes >= vec_len; bytes -= vec_len)
+       {
+               svuint8_t       vec;
+
+               pred = svwhilelt_b8(0, bytes);
+               vec = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+               popcnt += svaddv(pred, svcnt_x(pred, vec));
+               p += vec_len;
+       }
+
+       return (int) popcnt;
+}
+'''
+
+  if cc.links(prog, name: 'SVE popcount', args: test_c_args)
+    cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Select CRC-32C implementation.
 #
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index db6454090d2..2a67db077a9 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -706,6 +706,9 @@
 /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
 #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use SVE popcount instructions with a runtime check. */
+#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to build with systemd support. (--with-systemd) */
 #undef USE_SYSTEMD
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 9aa07e5d574..1bcb4ecb8ab 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
+#elif POPCNT_AARCH64
+/* Use the Neon version of pg_popcount{32,64} without function pointer. */
+extern int     pg_popcount32(uint32 word);
+extern int     pg_popcount64(uint64 word);
+
+/*
+ * We can try to use an SVE-optimized pg_popcount() on some systems.  For that,
+ * we do use a function pointer.
+ */
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
+#else
+extern uint64 pg_popcount_optimized(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
+#endif
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int     pg_popcount32(uint32 word);
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index 426bae660ef..48441269639 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -18,6 +18,229 @@
 
 #include <arm_neon.h>
 
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+#include <arm_sve.h>
+
+#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+#endif
+#endif
+
+/*
+ * The Neon versions are built regardless of whether we are building the SVE
+ * versions.
+ */
+static uint64 pg_popcount_neon(const char *buf, int bytes);
+static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+/*
+ * These are the SVE implementations of the popcount functions.
+ */
+static uint64 pg_popcount_sve(const char *buf, int bytes);
+static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
+
+/*
+ * The function pointers are initially set to "choose" functions.  These
+ * functions will first set the pointers to the right implementations (based on
+ * what the current CPU supports) and then will call the pointer to fulfill the
+ * caller's request.
+ */
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
+uint64         (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64         (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
+
+static inline bool
+pg_popcount_sve_available(void)
+{
+#ifdef HAVE_ELF_AUX_INFO
+       unsigned long value;
+
+       return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+               (value & HWCAP_SVE) != 0;
+#elif defined(HAVE_GETAUXVAL)
+       return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+#else
+       return false;
+#endif
+}
+
+static inline void
+choose_popcount_functions(void)
+{
+       if (pg_popcount_sve_available())
+       {
+               pg_popcount_optimized = pg_popcount_sve;
+               pg_popcount_masked_optimized = pg_popcount_masked_sve;
+       }
+       else
+       {
+               pg_popcount_optimized = pg_popcount_neon;
+               pg_popcount_masked_optimized = pg_popcount_masked_neon;
+       }
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+       choose_popcount_functions();
+       return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+       choose_popcount_functions();
+       return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
+/*
+ * pg_popcount_sve
+ *             Returns number of 1 bits in buf
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_sve(const char *buf, int bytes)
+{
+       uint32          vec_len = svcntb(),
+                               bytes_per_iteration = 4 * vec_len;
+       svuint64_t      accum1 = svdup_u64(0),
+                               accum2 = svdup_u64(0),
+                               accum3 = svdup_u64(0),
+                               accum4 = svdup_u64(0);
+       svbool_t        pred = svptrue_b64();
+       uint64          popcnt = 0;
+
+       /*
+        * For better instruction-level parallelism, each loop iteration operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               svuint64_t      vec;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+       popcnt += svaddv(pred, svadd_x(pred, accum3, accum4));
+
+       /*
+        * Process any remaining data.
+        */
+       for (; bytes >= vec_len; bytes -= vec_len)
+       {
+               svuint8_t       vec;
+
+               pred = svwhilelt_b8(0, bytes);
+               vec = svld1(pred, (const uint8 *) buf);
+               popcnt += svaddv(pred, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       return popcnt;
+}
+
+/*
+ * pg_popcount_masked_sve
+ *             Returns number of 1 bits in buf after applying the mask to each byte
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
+{
+       uint32          vec_len = svcntb(),
+                               bytes_per_iteration = 4 * vec_len;
+       svuint64_t      accum1 = svdup_u64(0),
+                               accum2 = svdup_u64(0),
+                               accum3 = svdup_u64(0),
+                               accum4 = svdup_u64(0);
+       svbool_t        pred = svptrue_b64();
+       uint64          popcnt = 0,
+                               mask64 = ~UINT64CONST(0) / 0xFF * mask;
+
+       /*
+        * For better instruction-level parallelism, each loop iteration operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               svuint64_t      vec;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+       popcnt += svaddv(pred, svadd_x(pred, accum3, accum4));
+
+       /*
+        * Process any remaining data.
+        */
+       for (; bytes >= vec_len; bytes -= vec_len)
+       {
+               svuint8_t       vec;
+
+               pred = svwhilelt_b8(0, bytes);
+               vec = svand_x(pred, svld1(pred, (const uint8 *) buf), mask);
+               popcnt += svaddv(pred, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       return popcnt;
+}
+
+#else                                                  /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * When the SVE version isn't available, there's no point in using function
+ * pointers to vary the implementation.  We instead just make these actual
+ * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
+ * The compiler should be able to inline the Neon versions here.
+ */
+uint64
+pg_popcount_optimized(const char *buf, int bytes)
+{
+       return pg_popcount_neon(buf, bytes);
+}
+
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+       return pg_popcount_masked_neon(buf, bytes, mask);
+}
+
+#endif                                                 /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
 /*
  * pg_popcount32
  *             Return number of 1 bits in word
@@ -39,11 +262,11 @@ pg_popcount64(uint64 word)
 }
 
 /*
- * pg_popcount_optimized
+ * pg_popcount_neon
  *             Returns number of 1 bits in buf
  */
-uint64
-pg_popcount_optimized(const char *buf, int bytes)
+static uint64
+pg_popcount_neon(const char *buf, int bytes)
 {
        uint8x16_t      vec;
        uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
@@ -119,11 +342,11 @@ pg_popcount_optimized(const char *buf, int bytes)
 }
 
 /*
- * pg_popcount_masked_optimized
+ * pg_popcount_masked_neon
  *             Returns number of 1 bits in buf after applying the mask to each byte
  */
-uint64
-pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+static uint64
+pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
 {
        uint8x16_t      vec;
        uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
-- 
2.39.5 (Apple Git-154)
