From 4422558d6ce777bd46283ac772f5b59f67d0011f Mon Sep 17 00:00:00 2001
From: Chiranmoy Bhattacharya <chiranmoy.bhattacharya@fujitsu.com>
Date: Tue, 10 Dec 2024 13:53:20 +0530
Subject: [PATCH v2] SVE support for popcount and popcount masked

---
 config/c-compiler.m4              |  41 +++++++++
 configure                         |  93 +++++++++++++++++++++
 configure.ac                      |  16 ++++
 meson.build                       |  31 +++++++
 src/Makefile.global.in            |   4 +
 src/include/pg_config.h.in        |   3 +
 src/include/port/pg_bitutils.h    |  14 ++++
 src/makefiles/meson.build         |   3 +-
 src/port/Makefile                 |  11 +++
 src/port/meson.build              |   4 +-
 src/port/pg_bitutils.c            |  10 ++-
 src/port/pg_popcount_sve.c        | 134 ++++++++++++++++++++++++++++++
 src/port/pg_popcount_sve_choose.c |  32 +++++++
 13 files changed, 393 insertions(+), 3 deletions(-)
 create mode 100644 src/port/pg_popcount_sve.c
 create mode 100644 src/port/pg_popcount_sve_choose.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index e112fd45d4..eabe68a773 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -704,3 +704,44 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_ARM_SVE_POPCNT_INTRINSICS
+# ------------------------------
+# Check if the compiler supports the ARM SVE popcount instructions using the
+# svdup_u64, svptrue_b64, svld1, svcnt_x, svadd_x, svaddv, and svwhilelt_b8
+# intrinsic functions.  Optional compiler flags can be passed as arguments
+# (e.g., -march=armv8-a+sve); the result is cached separately per flag set so
+# that a failed probe without flags cannot mask a later probe with flags.
+AC_DEFUN([PGAC_ARM_SVE_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_sve_popcnt_intrinsics_$1])])dnl
+  AC_CACHE_CHECK([for svdup_u64 and other intrinsics with CFLAGS=$1],
+                 [Ac_cachevar],
+  [
+    pgac_save_CFLAGS=$CFLAGS
+    CFLAGS="$pgac_save_CFLAGS $1"
+
+    AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_sve.h>],
+    [svbool_t predicate = svptrue_b64();
+     svuint64_t segment = svdup_u64(0), accum = svdup_u64(0);
+     uint64_t data = 0; /* stand-in buffer; the program is only linked */
+     const char *buf = (const char *) &data;
+
+     /* Using intrinsics as per the code */
+     predicate = svwhilelt_b8(0, 128);
+     segment = svld1(predicate, (const uint64_t *)buf);
+     accum = svadd_x(predicate, accum, svcnt_x(predicate, segment));
+     uint64_t popcnt = svaddv(predicate, accum);
+
+     /* Return computed value, to prevent the above being optimized away */
+     return popcnt;])],
+    [Ac_cachevar=yes],
+    [Ac_cachevar=no])
+
+    CFLAGS="$pgac_save_CFLAGS"
+  ])
+
+  if test x"$Ac_cachevar" = x"yes"; then
+    CFLAGS_POPCNT_ARM="$1"
+    pgac_arm_sve_popcnt_intrinsics=yes
+  fi
+undefine([Ac_cachevar])dnl
+])
diff --git a/configure b/configure
index 518c33b73a..a3e41459d5 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,8 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
+PG_POPCNT_OBJS_ARM
+CFLAGS_POPCNT_ARM
 LIBOBJS
 OPENSSL
 ZSTD
@@ -17159,6 +17161,97 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
   fi
 fi
 
+# Check for ARM SVE popcount intrinsics
+CFLAGS_POPCNT_ARM=""
+PG_POPCNT_OBJS_ARM=""
+
+if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_u64 with CFLAGS=" >&5
+$as_echo_n "checking for svcnt_u64 with CFLAGS=... " >&6; }
+if ${pgac_cv_arm_sve_popcnt_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+  CFLAGS="$pgac_save_CFLAGS "
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+int
+main ()
+{
+    svbool_t predicate = svptrue_b64();
+    svuint64_t segment, accum = svdup_u64(0);
+    uint64_t numVals = svlen_u64(segment);
+
+    svuint64_t counts = svcnt_u64_z(predicate, segment);
+    accum = svadd_u64_m(predicate, accum, counts);
+    return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_sve_popcnt_intrinsics_=yes
+else
+  pgac_cv_arm_sve_popcnt_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_popcnt_intrinsics_" >&5
+$as_echo "$pgac_cv_arm_sve_popcnt_intrinsics_" >&6; }
+if test x"$pgac_cv_arm_sve_popcnt_intrinsics_" = x"yes"; then
+  CFLAGS_POPCNT_ARM=""
+  pgac_arm_sve_popcnt_intrinsics=yes
+fi
+
+if test x"$pgac_arm_sve_popcnt_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_u64 with CFLAGS=-march=armv8-a+sve" >&5
+$as_echo_n "checking for svcnt_u64 with CFLAGS=-march=armv8-a+sve... " >&6; }
+if ${pgac_cv_arm_sve_popcnt_intrinsics__march_armv8_a_sve+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+  CFLAGS="$pgac_save_CFLAGS -march=armv8-a+sve"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+int
+main ()
+{
+    svbool_t predicate = svptrue_b64();
+    svuint64_t segment, accum = svdup_u64(0);
+    uint64_t numVals = svlen_u64(segment);
+
+    svuint64_t counts = svcnt_u64_z(predicate, segment);
+    accum = svadd_u64_m(predicate, accum, counts);
+    return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_sve_popcnt_intrinsics__march_armv8_a_sve=yes
+else
+  pgac_cv_arm_sve_popcnt_intrinsics__march_armv8_a_sve=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_popcnt_intrinsics__march_armv8_a_sve" >&5
+$as_echo "$pgac_cv_arm_sve_popcnt_intrinsics__march_armv8_a_sve" >&6; }
+if test x"$pgac_cv_arm_sve_popcnt_intrinsics__march_armv8_a_sve" = x"yes"; then
+  CFLAGS_POPCNT_ARM="-march=armv8-a+sve"
+  pgac_arm_sve_popcnt_intrinsics=yes
+fi
+
+fi
+if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then
+  PG_POPCNT_OBJS_ARM="pg_popcount_sve.o pg_popcount_sve_choose.o"
+
+  $as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
diff --git a/configure.ac b/configure.ac
index 247ae97fa4..1ea314190b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2021,6 +2021,22 @@ if test x"$host_cpu" = x"x86_64"; then
   fi
 fi
 
+# Check for ARM SVE popcount intrinsics (without, then with, -march=armv8-a+sve)
+CFLAGS_POPCNT_ARM=""
+PG_POPCNT_OBJS_ARM=""
+if test x"$host_cpu" = x"aarch64"; then
+  PGAC_ARM_SVE_POPCNT_INTRINSICS([])
+  if test x"$pgac_arm_sve_popcnt_intrinsics" != x"yes"; then
+    PGAC_ARM_SVE_POPCNT_INTRINSICS([-march=armv8-a+sve])
+  fi
+  if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then
+    PG_POPCNT_OBJS_ARM="pg_popcount_sve.o pg_popcount_sve_choose.o"
+    AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE popcount instructions with a runtime check.])
+  fi
+fi
+AC_SUBST(CFLAGS_POPCNT_ARM)
+AC_SUBST(PG_POPCNT_OBJS_ARM)
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index e5ce437a5c..6c936f2f2b 100644
--- a/meson.build
+++ b/meson.build
@@ -2191,6 +2191,37 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of ARM SVE popcount intrinsics.
+###############################################################
+
+cflags_popcnt_arm = []
+if host_cpu == 'aarch64'
+  # Link-test a tiny program using the SVE popcount intrinsics, first as-is.
+  prog = '''
+#include <arm_sve.h>
+
+int main(void)
+{
+    const svuint64_t val = svdup_u64(0xFFFFFFFFFFFFFFFF);
+    svuint64_t popcnt = svcnt_x(svptrue_b64(), val);
+    /* return computed value, to prevent the above being optimized away */
+    return svaddv(svptrue_b64(), popcnt) == 0;
+}
+'''
+
+  if cc.links(prog, name: 'ARM SVE popcount without -march=armv8-a+sve',
+        args: test_c_args)
+    cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
+  elif cc.links(prog, name: 'ARM SVE popcount with -march=armv8-a+sve',
+        args: test_c_args + ['-march=armv8-a+sve'])
+    cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
+    cflags_popcnt_arm += ['-march=armv8-a+sve']
+  endif
+
+endif
+
+
 ###############################################################
 # Select CRC-32C implementation.
 #
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index eac3d00121..2c32dfab5e 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,6 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
 CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
+CFLAGS_POPCNT_ARM = @CFLAGS_POPCNT_ARM@
 CFLAGS_CRC = @CFLAGS_CRC@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 PERMIT_MISSING_VARIABLE_DECLARATIONS = @PERMIT_MISSING_VARIABLE_DECLARATIONS@
@@ -770,6 +771,9 @@ LIBOBJS = @LIBOBJS@
 # files needed for the chosen CRC-32C implementation
 PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
 
+# files needed for the chosen popcount implementation
+PG_POPCNT_OBJS_ARM = @PG_POPCNT_OBJS_ARM@
+
 LIBS := -lpgcommon -lpgport $(LIBS)
 
 # to make ws2_32.lib the last library
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07b2f798ab..29c32bbbbe 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -648,6 +648,9 @@
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use SVE popcount instructions with a runtime check. */
+#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to 1 to build with Bonjour support. (--with-bonjour) */
 #undef USE_BONJOUR
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index a3cad46afe..57ebfddb7d 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -298,6 +298,14 @@ pg_ceil_log2_64(uint64 num)
 #endif
 #endif
 
+/*
+ * On AArch64 builds, try using SVE popcount instructions, but only if
+ * we can verify that the CPU supports it via a runtime check.
+ */
+#if defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
+#define TRY_POPCNT_FAST 1
+#endif
+
 #ifdef TRY_POPCNT_FAST
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
@@ -317,6 +325,12 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+extern bool pg_popcount_sve_available(void);
+extern uint64 pg_popcount_sve(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
+#endif
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int	pg_popcount32(uint32 word);
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index aba7411a1b..c0207426c2 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -102,6 +102,7 @@ pgxs_kv = {
     ' '.join(cflags_no_missing_var_decls),
 
   'CFLAGS_CRC': ' '.join(cflags_crc),
+  'CFLAGS_POPCNT_ARM': ' '.join(cflags_popcnt_arm),
   'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
   'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
 
@@ -179,7 +180,7 @@ pgxs_empty = [
   'WANTED_LANGUAGES',
 
   # Not needed because we don't build the server / PLs with the generated makefile
-  'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
+  'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'PG_POPCNT_OBJS_ARM', 'TAS',
   'PG_TEST_EXTRA',
   'DTRACEFLAGS', # only server has dtrace probes
 
diff --git a/src/port/Makefile b/src/port/Makefile
index 4c22431951..2e04ea4d5a 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS)
 OBJS = \
 	$(LIBOBJS) \
 	$(PG_CRC32C_OBJS) \
+	$(PG_POPCNT_OBJS_ARM) \
 	bsearch_arg.o \
 	chklocale.o \
 	inet_net_ntop.o \
@@ -87,6 +88,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_popcount_sve.o need CFLAGS_POPCNT_ARM
+pg_popcount_sve.o: CFLAGS+=$(CFLAGS_POPCNT_ARM)
+pg_popcount_sve_shlib.o: CFLAGS+=$(CFLAGS_POPCNT_ARM)
+pg_popcount_sve_srv.o: CFLAGS+=$(CFLAGS_POPCNT_ARM)
+
+# all versions of pg_popcount_sve_choose.o need CFLAGS_POPCNT_ARM
+pg_popcount_sve_choose.o: CFLAGS+=$(CFLAGS_POPCNT_ARM)
+pg_popcount_sve_choose_shlib.o: CFLAGS+=$(CFLAGS_POPCNT_ARM)
+pg_popcount_sve_choose_srv.o: CFLAGS+=$(CFLAGS_POPCNT_ARM)
+
 #
 # Shared library versions of object files
 #
diff --git a/src/port/meson.build b/src/port/meson.build
index c5bceed9cd..21d686a26e 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -91,6 +91,8 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_popcount_sve', 'USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
+  ['pg_popcount_sve_choose', 'USE_SVE_POPCNT_WITH_RUNTIME_CHECK'],
 
   # loongarch
   ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
@@ -99,7 +101,7 @@ replace_funcs_pos = [
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt_arm}
+pgport_sources_cflags = {'crc': [], 'popcnt': []}
 
 foreach f : replace_funcs_neg
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index c8399981ee..6b2e6b3794 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -135,7 +135,9 @@ pg_popcount_available(void)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
-#if defined(HAVE__GET_CPUID)
+#if defined(__aarch64__)
+	return false;						/* cpuid not available in __aarch64__ */
+#elif defined(HAVE__GET_CPUID)
 	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
 #elif defined(HAVE__CPUID)
 	__cpuid(exx, 1);
@@ -176,6 +178,12 @@ choose_popcount_functions(void)
 		pg_popcount_optimized = pg_popcount_avx512;
 		pg_popcount_masked_optimized = pg_popcount_masked_avx512;
 	}
+#elif defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
+	if (pg_popcount_sve_available())
+	{
+		pg_popcount_optimized = pg_popcount_sve;
+		pg_popcount_masked_optimized = pg_popcount_masked_sve;
+	}
 #endif
 }
 
diff --git a/src/port/pg_popcount_sve.c b/src/port/pg_popcount_sve.c
new file mode 100644
index 0000000000..c2a3a4cba0
--- /dev/null
+++ b/src/port/pg_popcount_sve.c
@@ -0,0 +1,134 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_sve.c
+ *	  Holds the SVE pg_popcount() implementation.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_sve.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+#include "port/pg_bitutils.h"
+
+#include <arm_sve.h>
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+/*
+ * pg_popcount_sve
+ *		Counts the 1-bits in the "bytes" bytes starting at buf, using SVE.
+ */
+uint64
+pg_popcount_sve(const char *buf, int bytes)
+{
+	svbool_t	pg;
+	svuint64_t	sum_a = svdup_u64(0),
+				sum_b = svdup_u64(0);
+	uint32		off,
+				vl = svcntb(),
+				head,
+				bulk;
+	uint64		total = 0;
+	const char *floor8 = (const char *) TYPEALIGN_DOWN(sizeof(uint64_t), buf);
+
+	/*
+	 * Peel off the unaligned head so the main loop starts 8-byte aligned.
+	 * Not done for short inputs, where aligning degrades performance.
+	 */
+	if (bytes > 4 * vl && floor8 != buf)
+	{
+		head = floor8 + sizeof(uint64_t) - buf;
+		pg = svwhilelt_b8(0U, head);
+		total = svaddv(pg, svcnt_z(pg, svld1(pg, (const uint8 *) buf)));
+		buf += head;
+		bytes -= head;
+	}
+
+	pg = svptrue_b64();
+	bulk = bytes & ~(vl * 2 - 1);
+
+	/* Main loop: two full vectors per pass, each with its own accumulator */
+	for (off = 0; off < bulk; off += 2 * vl)
+	{
+		svuint64_t	lo = svld1(pg, (const uint64 *) (buf + off));
+		svuint64_t	hi = svld1(pg, (const uint64 *) (buf + off + vl));
+
+		sum_a = svadd_x(pg, sum_a, svcnt_x(pg, lo));
+		sum_b = svadd_x(pg, sum_b, svcnt_x(pg, hi));
+	}
+
+	total += svaddv(pg, svadd_x(pg, sum_a, sum_b));	/* fold both accumulators */
+
+	/* Tail: what the main loop left over, one predicated vector at a time */
+	for (; off < bytes; off += vl)
+	{
+		pg = svwhilelt_b8(off, (uint32) bytes);
+		total += svaddv(pg, svcnt_z(pg, svld1(pg, (const uint8 *) (buf + off))));
+	}
+
+	return total;
+}
+
+/*
+ * pg_popcount_masked_sve
+ *		Counts the 1-bits in buf after ANDing every byte with mask, using SVE.
+ */
+uint64
+pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
+{
+	svbool_t	pg;
+	svuint8_t	masked;
+	svuint64_t	sum_a = svdup_u64(0),
+				sum_b = svdup_u64(0);
+	uint32		off,
+				vl = svcntb(),
+				head,
+				bulk;
+	uint64		total = 0,
+				wide_mask = ~UINT64CONST(0) / 0xFF * mask;
+	const char *floor8 = (const char *) TYPEALIGN_DOWN(sizeof(uint64_t), buf);
+
+	/*
+	 * Peel off the unaligned head so the main loop starts 8-byte aligned.
+	 * Not done for short inputs, where aligning degrades performance.
+	 */
+	if (bytes > 4 * vl && floor8 != buf)
+	{
+		head = floor8 + sizeof(uint64_t) - buf;
+		pg = svwhilelt_b8(0U, head);
+		masked = svand_n_u8_m(pg, svld1(pg, (const uint8 *) buf), mask);	/* load, then mask */
+		total = svaddv(pg, svcnt_z(pg, masked));
+		buf += head;
+		bytes -= head;
+	}
+
+	pg = svptrue_b64();
+	bulk = bytes & ~(vl * 2 - 1);
+
+	/* Main loop: two full vectors per pass, masked with the widened mask */
+	for (off = 0; off < bulk; off += 2 * vl)
+	{
+		svuint64_t	lo = svand_n_u64_x(pg, svld1(pg, (const uint64 *) (buf + off)), wide_mask);
+		svuint64_t	hi = svand_n_u64_x(pg, svld1(pg, (const uint64 *) (buf + off + vl)), wide_mask);
+
+		sum_a = svadd_x(pg, sum_a, svcnt_x(pg, lo));
+		sum_b = svadd_x(pg, sum_b, svcnt_x(pg, hi));
+	}
+
+	total += svaddv(pg, svadd_x(pg, sum_a, sum_b));	/* fold both accumulators */
+
+	/* Tail: what the main loop left over, one predicated vector at a time */
+	for (; off < bytes; off += vl)
+	{
+		pg = svwhilelt_b8(off, (uint32) bytes);
+		masked = svand_n_u8_m(pg, svld1(pg, (const uint8 *) (buf + off)), mask);
+		total += svaddv(pg, svcnt_z(pg, masked));
+	}
+
+	return total;
+}
+
+#endif							/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_popcount_sve_choose.c b/src/port/pg_popcount_sve_choose.c
new file mode 100644
index 0000000000..5f4e164f9c
--- /dev/null
+++ b/src/port/pg_popcount_sve_choose.c
@@ -0,0 +1,32 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_sve_choose.c
+ *    Test whether we can use the SVE pg_popcount() implementation.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/port/pg_popcount_sve_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+#include "port/pg_bitutils.h"
+
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+/*
+ * Reports whether SVE is usable at run time, i.e. whether the HWCAP_SVE bit
+ * is set in the AT_HWCAP value returned by getauxval().
+ */
+bool
+pg_popcount_sve_available(void)
+{
+	/* isolate the SVE bit and collapse it to a boolean */
+	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+}
+
+#endif							/* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
-- 
2.34.1

