On Mon, Apr 01, 2024 at 05:11:17PM -0500, Nathan Bossart wrote:
> Here is a v19 of the patch set. I moved out the refactoring of the
> function pointer selection code to 0001. I think this is a good change
> independent of $SUBJECT, and I plan to commit this soon. In 0002, I
> changed the syslogger.c usage of pg_popcount() to use pg_number_of_ones
> instead. This is standard practice elsewhere where the popcount functions
> are unlikely to win. I'll probably commit this one soon, too, as it's even
> more trivial than 0001.
>
> 0003 is the AVX512 POPCNT patch. Besides refactoring out 0001, there are
> no changes from v18. 0004 is an early proof-of-concept for using AVX512
> for the visibility map code. The code is missing comments, and I haven't
> performed any benchmarking yet, but I figured I'd post it because it
> demonstrates how it's possible to build upon 0003 in other areas.
I've committed the first two patches, and I've attached a rebased version
of the latter two.
> AFAICT the main open question is the function call overhead in 0003 that
> Alvaro brought up earlier. After 0002 is committed, I believe the only
> in-tree caller of pg_popcount() with very few bytes is bit_count(), and I'm
> not sure it's worth expending too much energy to make sure there are
> absolutely no regressions there. However, I'm happy to do so if folks feel
> that it is necessary, and I'd be grateful for thoughts on how to proceed on
> this one.
Another idea I had is to turn pg_popcount() into a macro that just uses the
pg_number_of_ones array when called for few bytes:
/*
 * pg_popcount_inline
 *		Count the 1-bits in buf by summing the per-byte entries of
 *		pg_number_of_ones[].  Meant for inputs of only a few bytes.
 */
static inline uint64
pg_popcount_inline(const char *buf, int bytes)
{
	const unsigned char *cur = (const unsigned char *) buf;
	uint64		total = 0;

	for (; bytes != 0; bytes--)
		total += pg_number_of_ones[*cur++];

	return total;
}
/*
 * Use the inlined lookup-table loop for short inputs and the optimized
 * implementation otherwise.  The arguments are parenthesized so that
 * low-precedence expressions (e.g., a conditional expression passed as
 * "bytes") expand correctly.  Note that "bytes" is evaluated twice, so it
 * must not have side effects.
 */
#define pg_popcount(buf, bytes) \
	(((bytes) < 64) ? \
	 pg_popcount_inline((buf), (bytes)) : \
	 pg_popcount_optimized((buf), (bytes)))
But again, I'm not sure this is really worth it for the current use-cases.
--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 3c5c3fdaffd623b513bcc476ee7c15f6379af1e7 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Wed, 27 Mar 2024 16:39:24 -0500
Subject: [PATCH v20 1/2] AVX512 popcount support
---
config/c-compiler.m4 | 58 ++++++
configure | 252 +++++++++++++++++++++++++++
configure.ac | 51 ++++++
meson.build | 87 +++++++++
src/Makefile.global.in | 5 +
src/include/pg_config.h.in | 12 ++
src/include/port/pg_bitutils.h | 15 ++
src/makefiles/meson.build | 4 +-
src/port/Makefile | 11 ++
src/port/meson.build | 6 +-
src/port/pg_bitutils.c | 7 +-
src/port/pg_popcount_avx512.c | 49 ++++++
src/port/pg_popcount_avx512_choose.c | 71 ++++++++
src/test/regress/expected/bit.out | 24 +++
src/test/regress/sql/bit.sql | 4 +
15 files changed, 651 insertions(+), 5 deletions(-)
create mode 100644 src/port/pg_popcount_avx512.c
create mode 100644 src/port/pg_popcount_avx512_choose.c
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3268a780bb..5fb60775ca 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -694,3 +694,61 @@ if test x"$Ac_cachevar" = x"yes"; then
fi
undefine([Ac_cachevar])dnl
])# PGAC_LOONGARCH_CRC32C_INTRINSICS
+
+# PGAC_XSAVE_INTRINSICS
+# ---------------------
+# Check if the compiler supports the XSAVE instructions using the _xgetbv
+# intrinsic function.
+#
+# An optional compiler flag can be passed as argument (e.g., -mxsave). If the
+# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE.
+AC_DEFUN([PGAC_XSAVE_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+ [return _xgetbv(0) & 0xe0;])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+ CFLAGS_XSAVE="$1"
+ pgac_xsave_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_XSAVE_INTRINSICS
+
+# PGAC_AVX512_POPCNT_INTRINSICS
+# -----------------------------
+# Check if the compiler supports the AVX512 POPCNT instructions using the
+# _mm512_setzero_si512, _mm512_loadu_si512, _mm512_popcnt_epi64,
+# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
+#
+# An optional compiler flag can be passed as argument
+# (e.g., -mavx512vpopcntdq). If the intrinsics are supported, sets
+# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
+AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+ [const char buf@<:@sizeof(__m512i)@:>@;
+ PG_INT64_TYPE popcnt = 0;
+ __m512i accum = _mm512_setzero_si512();
+ const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+ const __m512i cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+ popcnt = _mm512_reduce_add_epi64(accum);
+ /* return computed value, to prevent the above being optimized away */
+ return popcnt == 0;])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+ CFLAGS_POPCNT="$1"
+ pgac_avx512_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_POPCNT_INTRINSICS
diff --git a/configure b/configure
index 36feeafbb2..b48ed7f271 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,9 @@ MSGFMT_FLAGS
MSGFMT
PG_CRC32C_OBJS
CFLAGS_CRC
+PG_POPCNT_OBJS
+CFLAGS_POPCNT
+CFLAGS_XSAVE
LIBOBJS
OPENSSL
ZSTD
@@ -17404,6 +17407,40 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h
fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5
+$as_echo_n "checking for __get_cpuid_count... " >&6; }
+if ${pgac_cv__get_cpuid_count+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <cpuid.h>
+int
+main ()
+{
+unsigned int exx[4] = {0, 0, 0, 0};
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv__get_cpuid_count="yes"
+else
+ pgac_cv__get_cpuid_count="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5
+$as_echo "$pgac_cv__get_cpuid_count" >&6; }
+if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
+
+$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h
+
+fi
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5
$as_echo_n "checking for __cpuid... " >&6; }
if ${pgac_cv__cpuid+:} false; then :
@@ -17438,6 +17475,221 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5
+$as_echo_n "checking for __cpuidex... " >&6; }
+if ${pgac_cv__cpuidex+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <intrin.h>
+int
+main ()
+{
+unsigned int exx[4] = {0, 0, 0, 0};
+ __cpuidex(exx, 7, 0);
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv__cpuidex="yes"
+else
+ pgac_cv__cpuidex="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5
+$as_echo "$pgac_cv__cpuidex" >&6; }
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+
+$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
+
+fi
+
+# Check for XSAVE intrinsics
+#
+CFLAGS_XSAVE=""
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5
+$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; }
+if ${pgac_cv_xsave_intrinsics_+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <immintrin.h>
+int
+main ()
+{
+return _xgetbv(0) & 0xe0;
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_xsave_intrinsics_=yes
+else
+ pgac_cv_xsave_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" >&5
+$as_echo "$pgac_cv_xsave_intrinsics_" >&6; }
+if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then
+ CFLAGS_XSAVE=""
+ pgac_xsave_intrinsics=yes
+fi
+
+if test x"$pgac_xsave_intrinsics" != x"yes"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=-mxsave" >&5
+$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; }
+if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mxsave"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <immintrin.h>
+int
+main ()
+{
+return _xgetbv(0) & 0xe0;
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_xsave_intrinsics__mxsave=yes
+else
+ pgac_cv_xsave_intrinsics__mxsave=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics__mxsave" >&5
+$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; }
+if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then
+ CFLAGS_XSAVE="-mxsave"
+ pgac_xsave_intrinsics=yes
+fi
+
+fi
+if test x"$pgac_xsave_intrinsics" = x"yes"; then
+
+$as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
+
+fi
+
+
+# Check for AVX512 popcount intrinsics
+#
+CFLAGS_POPCNT=""
+PG_POPCNT_OBJS=""
+if test x"$host_cpu" = x"x86_64"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+ PG_INT64_TYPE popcnt = 0;
+ __m512i accum = _mm512_setzero_si512();
+ const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+ const __m512i cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+ popcnt = _mm512_reduce_add_epi64(accum);
+ /* return computed value, to prevent the above being optimized away */
+ return popcnt == 0;
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_avx512_popcnt_intrinsics_=yes
+else
+ pgac_cv_avx512_popcnt_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
+ CFLAGS_POPCNT=""
+ pgac_avx512_popcnt_intrinsics=yes
+fi
+
+ if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+ PG_INT64_TYPE popcnt = 0;
+ __m512i accum = _mm512_setzero_si512();
+ const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+ const __m512i cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+ popcnt = _mm512_reduce_add_epi64(accum);
+ /* return computed value, to prevent the above being optimized away */
+ return popcnt == 0;
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=yes
+else
+ pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" = x"yes"; then
+ CFLAGS_POPCNT="-mavx512vpopcntdq"
+ pgac_avx512_popcnt_intrinsics=yes
+fi
+
+ fi
+ if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+ PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+
+$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ fi
+fi
+
+
+
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 57f734879e..2bbd81dfb8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2052,6 +2052,17 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then
AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.])
fi
+AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>],
+ [[unsigned int exx[4] = {0, 0, 0, 0};
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+ ]])],
+ [pgac_cv__get_cpuid_count="yes"],
+ [pgac_cv__get_cpuid_count="no"])])
+if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
+ AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.])
+fi
+
AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
[[unsigned int exx[4] = {0, 0, 0, 0};
@@ -2063,6 +2074,46 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
fi
+AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
+ [[unsigned int exx[4] = {0, 0, 0, 0};
+ __cpuidex(exx, 7, 0);
+ ]])],
+ [pgac_cv__cpuidex="yes"],
+ [pgac_cv__cpuidex="no"])])
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+ AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
+fi
+
+# Check for XSAVE intrinsics
+#
+CFLAGS_XSAVE=""
+PGAC_XSAVE_INTRINSICS([])
+if test x"$pgac_xsave_intrinsics" != x"yes"; then
+ PGAC_XSAVE_INTRINSICS([-mxsave])
+fi
+if test x"$pgac_xsave_intrinsics" = x"yes"; then
+ AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
+fi
+AC_SUBST(CFLAGS_XSAVE)
+
+# Check for AVX512 popcount intrinsics
+#
+CFLAGS_POPCNT=""
+PG_POPCNT_OBJS=""
+if test x"$host_cpu" = x"x86_64"; then
+ PGAC_AVX512_POPCNT_INTRINSICS([])
+ if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+ PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq])
+ fi
+ if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+ PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+ AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX512 popcount instructions with a runtime check.])
+ fi
+fi
+AC_SUBST(CFLAGS_POPCNT)
+AC_SUBST(PG_POPCNT_OBJS)
+
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/meson.build b/meson.build
index 18b5be842e..96be29c22b 100644
--- a/meson.build
+++ b/meson.build
@@ -1783,6 +1783,30 @@ elif cc.links('''
endif
+# Check for __get_cpuid_count() and __cpuidex() in a similar fashion.
+if cc.links('''
+ #include <cpuid.h>
+ int main(int arg, char **argv)
+ {
+ unsigned int exx[4] = {0, 0, 0, 0};
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+ }
+ ''', name: '__get_cpuid_count',
+ args: test_c_args)
+ cdata.set('HAVE__GET_CPUID_COUNT', 1)
+elif cc.links('''
+ #include <intrin.h>
+ int main(int arg, char **argv)
+ {
+ unsigned int exx[4] = {0, 0, 0, 0};
+ __cpuidex(exx, 7, 0);
+ }
+ ''', name: '__cpuidex',
+ args: test_c_args)
+ cdata.set('HAVE__CPUIDEX', 1)
+endif
+
+
# Defend against clang being used on x86-32 without SSE2 enabled. As current
# versions of clang do not understand -fexcess-precision=standard, the use of
# x87 floating point operations leads to problems like isinf possibly returning
@@ -1996,6 +2020,69 @@ int main(void)
endif
+###############################################################
+# Check for the availability of XSAVE intrinsics.
+###############################################################
+
+cflags_xsave = []
+if host_cpu == 'x86' or host_cpu == 'x86_64'
+
+ prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+ return _xgetbv(0) & 0xe0;
+}
+'''
+
+ if cc.links(prog, name: 'XSAVE intrinsics without -mxsave',
+ args: test_c_args)
+ cdata.set('HAVE_XSAVE_INTRINSICS', 1)
+ elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave',
+ args: test_c_args + ['-mxsave'])
+ cdata.set('HAVE_XSAVE_INTRINSICS', 1)
+ cflags_xsave += '-mxsave'
+ endif
+
+endif
+
+
+###############################################################
+# Check for the availability of AVX512 popcount intrinsics.
+###############################################################
+
+cflags_popcnt = []
+if host_cpu == 'x86_64'
+
+ prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+ const char buf[sizeof(__m512i)];
+ INT64 popcnt = 0;
+ __m512i accum = _mm512_setzero_si512();
+ const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+ const __m512i cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+ popcnt = _mm512_reduce_add_epi64(accum);
+ /* return computed value, to prevent the above being optimized away */
+ return popcnt == 0;
+}
+'''
+
+ if cc.links(prog, name: 'AVX512 popcount without -mavx512vpopcntdq',
+ args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
+ cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+ elif cc.links(prog, name: 'AVX512 popcount with -mavx512vpopcntdq',
+ args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'])
+ cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+ cflags_popcnt += '-mavx512vpopcntdq'
+ endif
+
+endif
+
###############################################################
# Select CRC-32C implementation.
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 8b3f8c24e0..36d880d225 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,7 +262,9 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
+CFLAGS_POPCNT = @CFLAGS_POPCNT@
CFLAGS_CRC = @CFLAGS_CRC@
+CFLAGS_XSAVE = @CFLAGS_XSAVE@
PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
CXXFLAGS = @CXXFLAGS@
@@ -758,6 +760,9 @@ LIBOBJS = @LIBOBJS@
# files needed for the chosen CRC-32C implementation
PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
+# files needed for the chosen popcount implementation
+PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
+
LIBS := -lpgcommon -lpgport $(LIBS)
# to make ws2_32.lib the last library
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 591e1ca3df..de067e6182 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -513,6 +513,9 @@
/* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */
#undef HAVE_X86_64_POPCNTQ
+/* Define to 1 if you have XSAVE intrinsics. */
+#undef HAVE_XSAVE_INTRINSICS
+
/* Define to 1 if the system has the type `_Bool'. */
#undef HAVE__BOOL
@@ -555,9 +558,15 @@
/* Define to 1 if you have __cpuid. */
#undef HAVE__CPUID
+/* Define to 1 if you have __cpuidex. */
+#undef HAVE__CPUIDEX
+
/* Define to 1 if you have __get_cpuid. */
#undef HAVE__GET_CPUID
+/* Define to 1 if you have __get_cpuid_count. */
+#undef HAVE__GET_CPUID_COUNT
+
/* Define to 1 if your compiler understands _Static_assert. */
#undef HAVE__STATIC_ASSERT
@@ -680,6 +689,9 @@
/* Define to 1 to build with assertion checks. (--enable-cassert) */
#undef USE_ASSERT_CHECKING
+/* Define to 1 to use AVX512 popcount instructions with a runtime check. */
+#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+
/* Define to 1 to build with Bonjour support. (--with-bonjour) */
#undef USE_BONJOUR
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 53e5239717..1a92c56bcd 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -304,6 +304,21 @@ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
+/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+extern uint64 pg_popcount_fast(const char *buf, int bytes);
+
+/*
+ * We can also try to use the AVX512 popcount instruction on some systems.
+ * The implementation of that is located in its own file because it may
+ * require special compiler flags that we don't want to apply to any other
+ * files. Note that we only build this when TRY_POPCNT_FAST is set so that we
+ * can fall back to pg_popcount_fast() as needed.
+ */
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+extern bool pg_popcount_avx512_available(void);
+extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+#endif
+
#else
/* Use a portable implementation -- no need for a function pointer. */
extern int pg_popcount32(uint32 word);
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index b0f4178b3d..5618050b30 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -100,8 +100,10 @@ pgxs_kv = {
' '.join(cflags_no_decl_after_statement),
'CFLAGS_CRC': ' '.join(cflags_crc),
+ 'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
+ 'CFLAGS_XSAVE': ' '.join(cflags_xsave),
'LDFLAGS': var_ldflags,
'LDFLAGS_EX': var_ldflags_ex,
@@ -177,7 +179,7 @@ pgxs_empty = [
'WANTED_LANGUAGES',
# Not needed because we don't build the server / PLs with the generated makefile
- 'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
+ 'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
'DTRACEFLAGS', # only server has dtrace probes
'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp',
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..db7c02117b 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS)
OBJS = \
$(LIBOBJS) \
$(PG_CRC32C_OBJS) \
+ $(PG_POPCNT_OBJS) \
bsearch_arg.o \
chklocale.o \
inet_net_ntop.o \
@@ -92,6 +93,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
+# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
+pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
+
+# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
+pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
+
#
# Shared library versions of object files
#
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef..fd9ee199d1 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -84,6 +84,8 @@ replace_funcs_pos = [
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+ ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
+ ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'],
# arm / aarch64
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -98,8 +100,8 @@ replace_funcs_pos = [
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
]
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave}
+pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
foreach f : replace_funcs_neg
func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 28312f3dd9..177509518f 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -114,7 +114,6 @@ static int pg_popcount64_choose(uint64 word);
static uint64 pg_popcount_choose(const char *buf, int bytes);
static inline int pg_popcount32_fast(uint32 word);
static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
@@ -156,6 +155,10 @@ choose_popcount_functions(void)
pg_popcount32 = pg_popcount32_fast;
pg_popcount64 = pg_popcount64_fast;
pg_popcount = pg_popcount_fast;
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+ if (pg_popcount_avx512_available())
+ pg_popcount = pg_popcount_avx512;
+#endif
}
else
{
@@ -224,7 +227,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
* pg_popcount_fast
* Returns the number of 1-bits in buf
*/
-static uint64
+uint64
pg_popcount_fast(const char *buf, int bytes)
{
uint64 popcnt = 0;
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
new file mode 100644
index 0000000000..f86558d1ee
--- /dev/null
+++ b/src/port/pg_popcount_avx512.c
@@ -0,0 +1,49 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512.c
+ * Holds the pg_popcount() implementation that uses AVX512 instructions.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/port/pg_popcount_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include <immintrin.h>
+
+#include "port/pg_bitutils.h"
+
+/*
+ * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * use AVX512 intrinsics, but we check it anyway to be sure. We rely on
+ * pg_popcount_fast() as our fallback implementation in pg_popcount_avx512().
+ */
+#ifdef TRY_POPCNT_FAST
+
+/*
+ * pg_popcount_avx512
+ * Returns the number of 1-bits in buf
+ */
+uint64
+pg_popcount_avx512(const char *buf, int bytes)
+{
+ uint64 popcnt;
+ __m512i accum = _mm512_setzero_si512();
+
+ for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+ {
+ const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+ const __m512i cnt = _mm512_popcnt_epi64(val);
+
+ accum = _mm512_add_epi64(accum, cnt);
+ buf += sizeof(__m512i);
+ }
+
+ popcnt = _mm512_reduce_add_epi64(accum);
+ return popcnt + pg_popcount_fast(buf, bytes);
+}
+
+#endif /* TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c
new file mode 100644
index 0000000000..9e81cd33ad
--- /dev/null
+++ b/src/port/pg_popcount_avx512_choose.c
@@ -0,0 +1,71 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512_choose.c
+ * Test whether we can use AVX512 POPCNT instructions.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/port/pg_popcount_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE_XSAVE_INTRINSICS
+#include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+/*
+ * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * use AVX512 intrinsics, but we check it anyway to be sure. We rely on
+ * pg_popcount_fast() as our fallback implementation in pg_popcount_avx512().
+ */
+#ifdef TRY_POPCNT_FAST
+
+/*
+ * Returns true if AVX512 POPCNT is usable (CPU support plus OS-enabled state).
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+ unsigned int exx[4] = {0, 0, 0, 0};
+
+ /* does CPUID say there's support for AVX512 POPCNT? */
+#if defined(HAVE__GET_CPUID_COUNT)
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+ __cpuidex(exx, 7, 0);
+#endif
+ if ((exx[2] & (1 << 14)) == 0) /* avx512vpopcntdq */
+ return false;
+
+ /* does CPUID say the OS has enabled XGETBV (OSXSAVE, not merely XSAVE)? */
+ memset(exx, 0, sizeof(exx));
+#if defined(HAVE__GET_CPUID)
+ __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+ __cpuid(exx, 1);
+#endif
+ if ((exx[2] & (1 << 27)) == 0) /* osxsave */
+ return false;
+
+ /* does XCR0 say SSE, AVX, opmask, and both ZMM states are all enabled? */
+#ifdef HAVE_XSAVE_INTRINSICS
+ return (_xgetbv(0) & 0xe6) == 0xe6;
+#else
+ return false;
+#endif
+}
+
+#endif /* TRY_POPCNT_FAST */
diff --git a/src/test/regress/expected/bit.out b/src/test/regress/expected/bit.out
index e17cbf42ca..6a436288bb 100644
--- a/src/test/regress/expected/bit.out
+++ b/src/test/regress/expected/bit.out
@@ -740,6 +740,30 @@ SELECT bit_count(B'1111111111'::bit(10));
10
(1 row)
+SELECT bit_count(repeat('0', 100)::bit(100));
+ bit_count
+-----------
+ 0
+(1 row)
+
+SELECT bit_count(repeat('1', 100)::bit(100));
+ bit_count
+-----------
+ 100
+(1 row)
+
+SELECT bit_count(repeat('01', 500)::bit(1000));
+ bit_count
+-----------
+ 500
+(1 row)
+
+SELECT bit_count(repeat('10101', 200)::bit(1000));
+ bit_count
+-----------
+ 600
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git a/src/test/regress/sql/bit.sql b/src/test/regress/sql/bit.sql
index 34230b99fb..8ba6facd03 100644
--- a/src/test/regress/sql/bit.sql
+++ b/src/test/regress/sql/bit.sql
@@ -223,6 +223,10 @@ SELECT overlay(B'0101011100' placing '001' from 20);
-- bit_count
SELECT bit_count(B'0101011100'::bit(10));
SELECT bit_count(B'1111111111'::bit(10));
+SELECT bit_count(repeat('0', 100)::bit(100));
+SELECT bit_count(repeat('1', 100)::bit(100));
+SELECT bit_count(repeat('01', 500)::bit(1000));
+SELECT bit_count(repeat('10101', 200)::bit(1000));
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
--
2.25.1
>From 6c08e5bc0000fcb7a3cbd485e012b049a671c00b Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Sun, 31 Mar 2024 22:22:15 -0500
Subject: [PATCH v20 2/2] optimize visibilitymap_count() with AVX512
---
src/backend/access/heap/visibilitymap.c | 25 ++----
src/include/port/pg_bitutils.h | 6 +-
src/port/pg_bitutils.c | 113 ++++++++++++++++++++++++
src/port/pg_popcount_avx512.c | 21 +++++
4 files changed, 144 insertions(+), 21 deletions(-)
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 1ab6c865e3..8b24e7bc33 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -119,10 +119,8 @@
#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
/* Masks for counting subsets of bits in the visibility map. */
-#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each
- * bit pair */
-#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
- * bit pair */
+#define VISIBLE_MASK8 (0x55) /* The lower bit of each bit pair */
+#define FROZEN_MASK8 (0xaa) /* The upper bit of each bit pair */
/* prototypes for internal routines */
static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
{
Buffer mapBuffer;
uint64 *map;
- int i;
/*
* Read till we fall off the end of the map. We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
*/
map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
- StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
- "unsupported MAPSIZE");
- if (all_frozen == NULL)
- {
- for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
- nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
- }
- else
- {
- for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
- {
- nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
- nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
- }
- }
+ nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
+ if (all_frozen)
+ nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
ReleaseBuffer(mapBuffer);
}
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 1a92c56bcd..bafb20a427 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -303,9 +303,11 @@ pg_ceil_log2_64(uint64 num)
extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked) (const char *buf, int bytes, bits8 mask);
-/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+/* Exported for use in the AVX512 implementation. */
extern uint64 pg_popcount_fast(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
/*
* We can also try to use the AVX512 popcount instruction on some systems.
@@ -317,6 +319,7 @@ extern uint64 pg_popcount_fast(const char *buf, int bytes);
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
extern bool pg_popcount_avx512_available(void);
extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
#endif
#else
@@ -324,6 +327,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
extern int pg_popcount32(uint32 word);
extern int pg_popcount64(uint64 word);
extern uint64 pg_popcount(const char *buf, int bytes);
+extern uint64 pg_popcount_masked(const char *buf, int bytes, bits8 mask);
#endif /* TRY_POPCNT_FAST */
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 177509518f..902ecdebbf 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -106,18 +106,21 @@ const uint8 pg_number_of_ones[256] = {
static inline int pg_popcount32_slow(uint32 word);
static inline int pg_popcount64_slow(uint64 word);
static uint64 pg_popcount_slow(const char *buf, int bytes);
+static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
#ifdef TRY_POPCNT_FAST
static bool pg_popcount_available(void);
static int pg_popcount32_choose(uint32 word);
static int pg_popcount64_choose(uint64 word);
static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
static inline int pg_popcount32_fast(uint32 word);
static inline int pg_popcount64_fast(uint64 word);
int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
+uint64 (*pg_popcount_masked) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
#endif /* TRY_POPCNT_FAST */
#ifdef TRY_POPCNT_FAST
@@ -155,9 +158,13 @@ choose_popcount_functions(void)
pg_popcount32 = pg_popcount32_fast;
pg_popcount64 = pg_popcount64_fast;
pg_popcount = pg_popcount_fast;
+ pg_popcount_masked = pg_popcount_masked_fast;
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
if (pg_popcount_avx512_available())
+ {
pg_popcount = pg_popcount_avx512;
+ pg_popcount_masked = pg_popcount_masked_avx512;
+ }
#endif
}
else
@@ -165,6 +172,7 @@ choose_popcount_functions(void)
pg_popcount32 = pg_popcount32_slow;
pg_popcount64 = pg_popcount64_slow;
pg_popcount = pg_popcount_slow;
+ pg_popcount_masked = pg_popcount_masked_slow;
}
}
@@ -189,6 +197,13 @@ pg_popcount_choose(const char *buf, int bytes)
return pg_popcount(buf, bytes);
}
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+ choose_popcount_functions(); /* repoints pg_popcount_masked at the slow, fast or AVX512 variant */
+ return pg_popcount_masked(buf, bytes, mask); /* re-dispatch through the updated pointer */
+}
+
/*
* pg_popcount32_fast
* Return the number of 1 bits set in word
@@ -269,6 +284,52 @@ pg_popcount_fast(const char *buf, int bytes)
return popcnt;
}
+uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+ uint64 popcnt = 0; /* running count of 1-bits in (buf[i] & mask) over all bytes */
+
+#if SIZEOF_VOID_P >= 8
+ /* Replicate mask into all 8 bytes; count aligned buffers a 64-bit word at a time. */
+ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+ if (buf == (const char *) TYPEALIGN(8, buf))
+ {
+ const uint64 *words = (const uint64 *) buf;
+
+ while (bytes >= 8)
+ {
+ popcnt += pg_popcount64_fast(*words++ & maskv);
+ bytes -= 8;
+ }
+
+ buf = (const char *) words;
+ }
+#else
+ /* Same with 32-bit words.  Plain ~0 is int -1 and -1 / 0xFF == 0, so cast first. */
+ uint32 maskv = ~((uint32) 0) / 0xFF * mask;
+
+ if (buf == (const char *) TYPEALIGN(4, buf))
+ {
+ const uint32 *words = (const uint32 *) buf;
+
+ while (bytes >= 4)
+ {
+ popcnt += pg_popcount32_fast(*words++ & maskv);
+ bytes -= 4;
+ }
+
+ buf = (const char *) words;
+ }
+#endif
+
+ /* Remaining bytes (all of them if buf was unaligned): mask each, use the lookup table. */
+ while (bytes--)
+ popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+ return popcnt;
+}
+
#endif /* TRY_POPCNT_FAST */
@@ -368,6 +429,52 @@ pg_popcount_slow(const char *buf, int bytes)
return popcnt;
}
+static uint64
+pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+{
+ uint64 popcnt = 0; /* running count of 1-bits in (buf[i] & mask) over all bytes */
+
+#if SIZEOF_VOID_P >= 8
+ /* Replicate mask into all 8 bytes; count aligned buffers a 64-bit word at a time. */
+ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+ if (buf == (const char *) TYPEALIGN(8, buf))
+ {
+ const uint64 *words = (const uint64 *) buf;
+
+ while (bytes >= 8)
+ {
+ popcnt += pg_popcount64_slow(*words++ & maskv);
+ bytes -= 8;
+ }
+
+ buf = (const char *) words;
+ }
+#else
+ /* Same with 32-bit words.  Plain ~0 is int -1 and -1 / 0xFF == 0, so cast first. */
+ uint32 maskv = ~((uint32) 0) / 0xFF * mask;
+
+ if (buf == (const char *) TYPEALIGN(4, buf))
+ {
+ const uint32 *words = (const uint32 *) buf;
+
+ while (bytes >= 4)
+ {
+ popcnt += pg_popcount32_slow(*words++ & maskv);
+ bytes -= 4;
+ }
+
+ buf = (const char *) words;
+ }
+#endif
+
+ /* Remaining bytes (all of them if buf was unaligned): mask each, use the lookup table. */
+ while (bytes--)
+ popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+ return popcnt;
+}
+
#ifndef TRY_POPCNT_FAST
/*
@@ -399,4 +506,10 @@ pg_popcount(const char *buf, int bytes)
return pg_popcount_slow(buf, bytes);
}
+uint64
+pg_popcount_masked(const char *buf, int bytes, bits8 mask)
+{
+ return pg_popcount_masked_slow(buf, bytes, mask); /* !TRY_POPCNT_FAST build: plain function, always the _slow variant */
+}
+
#endif /* !TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index f86558d1ee..8965a8d530 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -46,4 +46,25 @@ pg_popcount_avx512(const char *buf, int bytes)
return popcnt + pg_popcount_fast(buf, bytes);
}
+uint64
+pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
+{
+ uint64 popcnt;
+ __m512i accum = _mm512_setzero_si512(); /* running per-lane (64-bit) totals */
+ const __m512i maskv = _mm512_set1_epi8(mask); /* mask copied into every byte of the vector */
+
+ for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) /* full vectors only; NOTE(review): int vs size_t compare assumes bytes >= 0 */
+ {
+ const __m512i val = _mm512_loadu_si512((const __m512i *) buf); /* unaligned load, so buf needs no alignment */
+ const __m512i vmasked = _mm512_and_si512(val, maskv);
+ const __m512i cnt = _mm512_popcnt_epi64(vmasked); /* bit count of each 64-bit lane */
+
+ accum = _mm512_add_epi64(accum, cnt);
+ buf += sizeof(__m512i);
+ }
+
+ popcnt = _mm512_reduce_add_epi64(accum); /* sum the lanes into one scalar */
+ return popcnt + pg_popcount_masked_fast(buf, bytes, mask); /* leftover bytes (less than one vector) */
+}
+
#endif /* TRY_POPCNT_FAST */
--
2.25.1