Okay, here is a slightly different approach that I've dubbed the "maximum
assumption" approach.  In short, I wanted to see how much we could simplify
the patch by making all possibly-reasonable assumptions about the compiler
and CPU.  These include:

* If the compiler understands AVX512 intrinsics, we assume that it also
  knows about the required CPUID and XGETBV intrinsics, and we assume that
  the conditions for TRY_POPCNT_FAST are true.
* If this is x86_64, CPUID will be supported by the CPU.
* If CPUID indicates AVX512 POPCNT support, the CPU also supports XGETBV.

Do any of these assumptions seem unreasonable or unlikely to be true for
all practical purposes?  I don't mind adding back some or all of the
configure/runtime checks if they seem necessary.  I guess the real test
will be the buildfarm...

Another big change in this version is that I've moved
pg_popcount_avx512_available() to its own file so that we only compile
pg_popcount_avx512() with the special compiler flags.  This is just an
oversight in previous versions.

Finally, I've modified the build scripts so that the AVX512 popcount stuff
is conditionally built based on the configure checks for both
autoconf/meson.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From d7864391c455ea77b8e555e40a358c59de1bd702 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 27 Mar 2024 16:39:24 -0500
Subject: [PATCH v16 1/1] AVX512 popcount support

---
 config/c-compiler.m4                 |  34 +++++++++
 configure                            | 100 +++++++++++++++++++++++++++
 configure.ac                         |  14 ++++
 meson.build                          |  35 ++++++++++
 src/Makefile.global.in               |   4 ++
 src/include/pg_config.h.in           |   3 +
 src/include/port/pg_bitutils.h       |  17 +++++
 src/makefiles/meson.build            |   3 +-
 src/port/Makefile                    |   6 ++
 src/port/meson.build                 |   6 +-
 src/port/pg_bitutils.c               |  56 ++++++---------
 src/port/pg_popcount_avx512.c        |  40 +++++++++++
 src/port/pg_popcount_avx512_choose.c |  61 ++++++++++++++++
 13 files changed, 340 insertions(+), 39 deletions(-)
 create mode 100644 src/port/pg_popcount_avx512.c
 create mode 100644 src/port/pg_popcount_avx512_choose.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3268a780bb..7d13368b23 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -694,3 +694,37 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_LOONGARCH_CRC32C_INTRINSICS
+
+# PGAC_AVX512_POPCNT_INTRINSICS
+# -----------------------------
+# Check if the compiler supports the AVX512 POPCNT instructions using the
+# _mm512_setzero_si512, _mm512_loadu_si512, _mm512_popcnt_epi64,
+# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
+#
+# An optional compiler flag can be passed as argument
+# (e.g., -mavx512vpopcntdq).  If the intrinsics are supported, sets
+# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
+AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [const char buf@<:@sizeof(__m512i)@:>@;
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_POPCNT="$1"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_POPCNT_INTRINSICS
diff --git a/configure b/configure
index 36feeafbb2..86c471f4ec 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,8 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
+CFLAGS_POPCNT
+PG_POPCNT_OBJS
 LIBOBJS
 OPENSSL
 ZSTD
@@ -17438,6 +17440,104 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
 
 fi
 
+# Check for AVX512 popcount intrinsics
+#
+PG_POPCNT_OBJS=""
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics_=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
+  CFLAGS_POPCNT=""
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" = x"yes"; then
+  CFLAGS_POPCNT="-mavx512vpopcntdq"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+fi
+if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+  PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+
+$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+fi
+
+
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 57f734879e..b1aebb8583 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2063,6 +2063,20 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
   AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
 fi
 
+# Check for AVX512 popcount intrinsics
+#
+PG_POPCNT_OBJS=""
+PGAC_AVX512_POPCNT_INTRINSICS([])
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq])
+fi
+if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+  PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+  AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX512 popcount instructions with a runtime check.])
+fi
+AC_SUBST(PG_POPCNT_OBJS)
+AC_SUBST(CFLAGS_POPCNT)
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/meson.build b/meson.build
index 18b5be842e..fbd2aa3dbd 100644
--- a/meson.build
+++ b/meson.build
@@ -1996,6 +1996,41 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of AVX512 popcount intrinsics.
+###############################################################
+
+cflags_popcnt = []
+if host_cpu == 'x86' or host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+    const char buf[sizeof(__m512i)];
+    INT64 popcnt = 0;
+    __m512i accum = _mm512_setzero_si512();
+    const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+    const __m512i cnt = _mm512_popcnt_epi64(val);
+    accum = _mm512_add_epi64(accum, cnt);
+    popcnt = _mm512_reduce_add_epi64(accum);
+    /* return computed value, to prevent the above being optimized away */
+    return popcnt == 0;
+}
+'''
+
+  if cc.links(prog, name: 'AVX512 popcount without -mavx512vpopcntdq',
+        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+  elif cc.links(prog, name: 'AVX512 popcount with -mavx512vpopcntdq',
+        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'])
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+    cflags_popcnt += '-mavx512vpopcntdq'
+  endif
+
+endif
+
 
 ###############################################################
 # Select CRC-32C implementation.
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 8b3f8c24e0..dec467b7dd 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,6 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
 CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
+CFLAGS_POPCNT = @CFLAGS_POPCNT@
 CFLAGS_CRC = @CFLAGS_CRC@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 CXXFLAGS = @CXXFLAGS@
@@ -758,6 +759,9 @@ LIBOBJS = @LIBOBJS@
 # files needed for the chosen CRC-32C implementation
 PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
 
+# files needed for the chosen popcount implementation
+PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
+
 LIBS := -lpgcommon -lpgport $(LIBS)
 
 # to make ws2_32.lib the last library
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 591e1ca3df..c271c06b74 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -680,6 +680,9 @@
 /* Define to 1 to build with assertion checks. (--enable-cassert) */
 #undef USE_ASSERT_CHECKING
 
+/* Define to 1 to use AVX512 popcount instructions with a runtime check. */
+#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to 1 to build with Bonjour support. (--with-bonjour) */
 #undef USE_BONJOUR
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 53e5239717..fc8d34ad25 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -298,12 +298,29 @@ pg_ceil_log2_64(uint64 num)
 #endif
 #endif
 
+/*
+ * We can also try to use the AVX512 popcount instruction on some systems.
+ * The implementation of that is located in its own file because it may
+ * require special compiler flags that we don't want to apply to any other
+ * files.
+ *
+ * NB: We assume that the availability of AVX512 intrinsics implies
+ * TRY_POPCNT_FAST.
+ */
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+extern bool pg_popcount_avx512_available(void);
+extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+#endif
+
 #ifdef TRY_POPCNT_FAST
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
 
+/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+extern uint64 pg_popcount_fast(const char *buf, int bytes);
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int	pg_popcount32(uint32 word);
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index b0f4178b3d..5a592ddaee 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -99,6 +99,7 @@ pgxs_kv = {
   'PERMIT_DECLARATION_AFTER_STATEMENT':
     ' '.join(cflags_no_decl_after_statement),
 
+  'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
   'CFLAGS_CRC': ' '.join(cflags_crc),
   'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
   'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
@@ -177,7 +178,7 @@ pgxs_empty = [
   'WANTED_LANGUAGES',
 
   # Not needed because we don't build the server / PLs with the generated makefile
-  'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
+  'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
   'DTRACEFLAGS', # only server has dtrace probes
 
   'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp',
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..7e154ac379 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS)
 OBJS = \
 	$(LIBOBJS) \
 	$(PG_CRC32C_OBJS) \
+	$(PG_POPCNT_OBJS) \
 	bsearch_arg.o \
 	chklocale.o \
 	inet_net_ntop.o \
@@ -92,6 +93,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
+pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
+
 #
 # Shared library versions of object files
 #
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef..7b93233428 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -84,6 +84,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
+  ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -98,8 +100,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt}
+pgport_sources_cflags = {'crc': [], 'popcnt': []}
 
 foreach f : replace_funcs_neg
   func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 1197696e97..ada3e777f7 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -114,7 +114,6 @@ static int	pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
@@ -142,20 +141,18 @@ pg_popcount_available(void)
 	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
 }
 
-/*
- * These functions get called on the first call to pg_popcount32 etc.
- * They detect whether we can use the asm implementations, and replace
- * the function pointers so that subsequent calls are routed directly to
- * the chosen implementation.
- */
-static int
-pg_popcount32_choose(uint32 word)
+static inline void
+choose_popcount_functions(void)
 {
 	if (pg_popcount_available())
 	{
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
 		pg_popcount = pg_popcount_fast;
+#ifdef USE_AVX512_POPCOUNT_WITH_RUNTIME_CHECK
+		if (pg_popcount_avx512_available())
+			pg_popcount = pg_popcount_avx512;
+#endif
 	}
 	else
 	{
@@ -163,45 +160,32 @@ pg_popcount32_choose(uint32 word)
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount = pg_popcount_slow;
 	}
+}
 
+/*
+ * These functions get called on the first call to pg_popcount32 etc.
+ * They detect whether we can use the asm implementations, and replace
+ * the function pointers so that subsequent calls are routed directly to
+ * the chosen implementation.
+ */
+static int
+pg_popcount32_choose(uint32 word)
+{
+	choose_popcount_functions();
 	return pg_popcount32(word);
 }
 
 static int
 pg_popcount64_choose(uint64 word)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount64(word);
 }
 
 static uint64
 pg_popcount_choose(const char *buf, int bytes)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount(buf, bytes);
 }
 
@@ -243,7 +227,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount_fast
  *		Returns the number of 1-bits in buf
  */
-static uint64
+uint64
 pg_popcount_fast(const char *buf, int bytes)
 {
 	uint64		popcnt = 0;
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
new file mode 100644
index 0000000000..c39db13f85
--- /dev/null
+++ b/src/port/pg_popcount_avx512.c
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512.c
+ *	  Holds the pg_popcount() implementation that uses AVX512 instructions.
+ *
+ * Copyright (c) 2019-2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include <immintrin.h>
+
+#include "port/pg_bitutils.h"
+
+/*
+ * pg_popcount_avx512
+ *		Returns the number of 1-bits in buf
+ */
+uint64
+pg_popcount_avx512(const char *buf, int bytes)
+{
+	uint64		popcnt;
+	__m512i		accum = _mm512_setzero_si512();
+
+	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	{
+		const		__m512i val = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i cnt = _mm512_popcnt_epi64(val);
+
+		accum = _mm512_add_epi64(accum, cnt);
+		buf += sizeof(__m512i);
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accum);
+	return popcnt + pg_popcount_fast(buf, bytes);
+}
diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c
new file mode 100644
index 0000000000..62ebc515ce
--- /dev/null
+++ b/src/port/pg_popcount_avx512_choose.c
@@ -0,0 +1,61 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512_choose.c
+ *    Test whether we can use AVX512 POPCNT instructions.
+ *
+ * Copyright (c) 2019-2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/port/pg_popcount_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+/*
+ * Return true if CPUID indicates that the AVX512 POPCNT instruction is
+ * available.
+ *
+ * NB: We assume the availability of AVX512 intrinsics implies availability of
+ * the required CPUID and XGETBV intrinsics.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+
+	if ((exx[2] & (1 << 14)) != 0)	/* avx512vpopcntdq */
+	{
+		/* Check that the OS has enabled support for the ZMM registers. */
+#ifdef _MSC_VER
+		return (_xgetbv(0) & 0xe0) != 0;
+#else
+		uint64		xcr = 0;
+		uint32		high;
+		uint32		low;
+
+__asm__ __volatile__(" xgetbv\n":"=a"(low), "=d"(high):"c"(xcr));
+		return (low & 0xe0) != 0;
+#endif
+	}
+
+	return false;
+}
-- 
2.25.1

Reply via email to