On Tue, Oct 08, 2024 at 09:36:03PM -0500, Nathan Bossart wrote:
> On Wed, Jul 31, 2024 at 04:43:02PM -0500, Nathan Bossart wrote:
>> On Wed, Jul 31, 2024 at 01:52:54PM -0700, Andres Freund wrote:
>>> I think we'd be better off enabling architectural features on a per-function
>>> basis, roughly like this:
>>>
>>> [...]
>>>
>>> /* FIXME: Should be gated by configure check of -mavx512vpopcntdq
>>> -mavx512bw support */
>>> pg_enable_target("avx512vpopcntdq,avx512bw")
>>> uint64_t
>>> pg_popcount_avx512(const char *buf, int bytes)
>>> ...
>>
>> I remember wondering why the CRC-32C code wasn't already doing something
>> like this (old compiler versions? non-gcc-like compilers?), and I'm not
>> sure I ever discovered the reason, so out of an abundance of caution I used
>> the same approach for AVX-512. If we can convince ourselves that
>> __attribute__((target("..."))) is standard enough at this point, +1 for
>> moving to that.
>
> [...]
>
> So, at least for the CRC code, __attribute__((target("..."))) was probably
> not widely available enough yet when it was first added. Unfortunately,
> the ARMv8 CRC target support (without -march) is still pretty new, but it
> might be possible to switch the others to a per-function approach in v18.
Here is a first attempt at using __attribute__((target("..."))) for the
AVX-512 stuff. Besides allowing us to consolidate the code into a single
file, this simplifies the build file changes quite a bit.
--
nathan
>From c97e25e56347c90f169a5ce069a9ea06c873915b Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Wed, 16 Oct 2024 15:57:55 -0500
Subject: [PATCH v1 1/1] use __attribute__((target(...))) for AVX-512 stuff
---
config/c-compiler.m4 | 60 +++++-----
configure | 163 ++++++---------------------
configure.ac | 17 +--
meson.build | 17 +--
src/Makefile.global.in | 5 -
src/include/c.h | 10 ++
src/makefiles/meson.build | 4 +-
src/port/Makefile | 12 +-
src/port/meson.build | 7 +-
src/port/pg_popcount_avx512.c | 86 +++++++++++++-
src/port/pg_popcount_avx512_choose.c | 102 -----------------
11 files changed, 171 insertions(+), 312 deletions(-)
delete mode 100644 src/port/pg_popcount_avx512_choose.c
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 10f8c7bd0a..aa90f8ef33 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -700,20 +700,20 @@ undefine([Ac_cachevar])dnl
# Check if the compiler supports the XSAVE instructions using the _xgetbv
# intrinsic function.
#
-# An optional compiler flag can be passed as argument (e.g., -mxsave). If the
-# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE.
+# If the intrinsics are supported, sets pgac_xsave_intrinsics.
AC_DEFUN([PGAC_XSAVE_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
- [return _xgetbv(0) & 0xe0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics])])dnl
+AC_CACHE_CHECK([for _xgetbv], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+ __attribute__((target("xsave")))
+ static int xsave_test(void)
+ {
+ return _xgetbv(0) & 0xe0;
+ }],
+ [return xsave_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_XSAVE="$1"
pgac_xsave_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
@@ -725,29 +725,27 @@ undefine([Ac_cachevar])dnl
# _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64,
# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
#
-# Optional compiler flags can be passed as argument (e.g., -mavx512vpopcntdq
-# -mavx512bw). If the intrinsics are supported, sets
-# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
+# If the intrinsics are supported, sets pgac_avx512_popcnt_intrinsics.
AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
- [const char buf@<:@sizeof(__m512i)@:>@;
- PG_INT64_TYPE popcnt = 0;
- __m512i accum = _mm512_setzero_si512();
- const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0,
(const __m512i *) buf);
- const __m512i cnt = _mm512_popcnt_epi64(val);
- accum = _mm512_add_epi64(accum, cnt);
- popcnt = _mm512_reduce_add_epi64(accum);
- /* return computed value, to prevent the above being optimized away */
- return popcnt == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+ __attribute__((target("avx512vpopcntdq","avx512bw")))
+ static int popcount_test(void)
+ {
+ const char buf@<:@sizeof(__m512i)@:>@;
+ PG_INT64_TYPE popcnt = 0;
+ __m512i accum = _mm512_setzero_si512();
+ const __m512i val = _mm512_maskz_loadu_epi8((__mmask64)
0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
+ const __m512i cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+ popcnt = _mm512_reduce_add_epi64(accum);
+ return (int) popcnt;
+ }],
+ [return popcount_test();])],
[Ac_cachevar=yes],
- [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+ [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
- CFLAGS_POPCNT="$1"
pgac_avx512_popcnt_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
diff --git a/configure b/configure
index 3a577e463b..028f448a29 100755
--- a/configure
+++ b/configure
@@ -647,9 +647,6 @@ MSGFMT_FLAGS
MSGFMT
PG_CRC32C_OBJS
CFLAGS_CRC
-PG_POPCNT_OBJS
-CFLAGS_POPCNT
-CFLAGS_XSAVE
LIBOBJS
OPENSSL
ZSTD
@@ -17270,185 +17267,99 @@ fi
# Check for XSAVE intrinsics
#
-CFLAGS_XSAVE=""
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5
-$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; }
-if ${pgac_cv_xsave_intrinsics_+:} false; then :
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
+$as_echo_n "checking for _xgetbv... " >&6; }
+if ${pgac_cv_xsave_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <immintrin.h>
-int
-main ()
-{
-return _xgetbv(0) & 0xe0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_xsave_intrinsics_=yes
-else
- pgac_cv_xsave_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_"
>&5
-$as_echo "$pgac_cv_xsave_intrinsics_" >&6; }
-if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then
- CFLAGS_XSAVE=""
- pgac_xsave_intrinsics=yes
-fi
-
-if test x"$pgac_xsave_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with
CFLAGS=-mxsave" >&5
-$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; }
-if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -mxsave"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <immintrin.h>
+ __attribute__((target("xsave")))
+ static int xsave_test(void)
+ {
+ return _xgetbv(0) & 0xe0;
+ }
int
main ()
{
-return _xgetbv(0) & 0xe0;
+return xsave_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_xsave_intrinsics__mxsave=yes
+ pgac_cv_xsave_intrinsics=yes
else
- pgac_cv_xsave_intrinsics__mxsave=no
+ pgac_cv_xsave_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result:
$pgac_cv_xsave_intrinsics__mxsave" >&5
-$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; }
-if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then
- CFLAGS_XSAVE="-mxsave"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics" >&5
+$as_echo "$pgac_cv_xsave_intrinsics" >&6; }
+if test x"$pgac_cv_xsave_intrinsics" = x"yes"; then
pgac_xsave_intrinsics=yes
fi
-fi
if test x"$pgac_xsave_intrinsics" = x"yes"; then
$as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
fi
-
# Check for AVX-512 popcount intrinsics
#
-CFLAGS_POPCNT=""
-PG_POPCNT_OBJS=""
if test x"$host_cpu" = x"x86_64"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64
with CFLAGS=" >&5
-$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
-if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64"
>&5
+$as_echo_n "checking for _mm512_popcnt_epi64... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-#include <immintrin.h>
-int
-main ()
-{
-const char buf[sizeof(__m512i)];
- PG_INT64_TYPE popcnt = 0;
- __m512i accum = _mm512_setzero_si512();
- const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0,
(const __m512i *) buf);
- const __m512i cnt = _mm512_popcnt_epi64(val);
- accum = _mm512_add_epi64(accum, cnt);
- popcnt = _mm512_reduce_add_epi64(accum);
- /* return computed value, to prevent the above being optimized away */
- return popcnt == 0;
- ;
- return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_avx512_popcnt_intrinsics_=yes
-else
- pgac_cv_avx512_popcnt_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result:
$pgac_cv_avx512_popcnt_intrinsics_" >&5
-$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
-if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
- CFLAGS_POPCNT=""
- pgac_avx512_popcnt_intrinsics=yes
-fi
-
- if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64
with CFLAGS=-mavx512vpopcntdq -mavx512bw" >&5
-$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq
-mavx512bw... " >&6; }
-if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw+:} false;
then :
- $as_echo_n "(cached) " >&6
-else
- pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512bw"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <immintrin.h>
+ __attribute__((target("avx512vpopcntdq","avx512bw")))
+ static int popcount_test(void)
+ {
+ const char buf[sizeof(__m512i)];
+ PG_INT64_TYPE popcnt = 0;
+ __m512i accum = _mm512_setzero_si512();
+ const __m512i val = _mm512_maskz_loadu_epi8((__mmask64)
0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
+ const __m512i cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+ popcnt = _mm512_reduce_add_epi64(accum);
+ return (int) popcnt;
+ }
int
main ()
{
-const char buf[sizeof(__m512i)];
- PG_INT64_TYPE popcnt = 0;
- __m512i accum = _mm512_setzero_si512();
- const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0,
(const __m512i *) buf);
- const __m512i cnt = _mm512_popcnt_epi64(val);
- accum = _mm512_add_epi64(accum, cnt);
- popcnt = _mm512_reduce_add_epi64(accum);
- /* return computed value, to prevent the above being optimized away */
- return popcnt == 0;
+return popcount_test();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
- pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=yes
+ pgac_cv_avx512_popcnt_intrinsics=yes
else
- pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=no
+ pgac_cv_avx512_popcnt_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result:
$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&5
-$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&6;
}
-if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" =
x"yes"; then
- CFLAGS_POPCNT="-mavx512vpopcntdq -mavx512bw"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result:
$pgac_cv_avx512_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics" = x"yes"; then
pgac_avx512_popcnt_intrinsics=yes
fi
- fi
if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
- PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
fi
fi
-
-
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 55f6c46d33..8bdb9c576a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2052,32 +2052,19 @@ fi
# Check for XSAVE intrinsics
#
-CFLAGS_XSAVE=""
-PGAC_XSAVE_INTRINSICS([])
-if test x"$pgac_xsave_intrinsics" != x"yes"; then
- PGAC_XSAVE_INTRINSICS([-mxsave])
-fi
+PGAC_XSAVE_INTRINSICS()
if test x"$pgac_xsave_intrinsics" = x"yes"; then
AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE
intrinsics.])
fi
-AC_SUBST(CFLAGS_XSAVE)
# Check for AVX-512 popcount intrinsics
#
-CFLAGS_POPCNT=""
-PG_POPCNT_OBJS=""
if test x"$host_cpu" = x"x86_64"; then
- PGAC_AVX512_POPCNT_INTRINSICS([])
- if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
- PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512bw])
- fi
+ PGAC_AVX512_POPCNT_INTRINSICS()
if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
- PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use
AVX-512 popcount instructions with a runtime check.])
fi
fi
-AC_SUBST(CFLAGS_POPCNT)
-AC_SUBST(PG_POPCNT_OBJS)
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
diff --git a/meson.build b/meson.build
index 58e67975e8..9492f9d52e 100644
--- a/meson.build
+++ b/meson.build
@@ -2157,25 +2157,20 @@ endforeach
# Check for the availability of XSAVE intrinsics.
###############################################################
-cflags_xsave = []
if host_cpu == 'x86' or host_cpu == 'x86_64'
prog = '''
#include <immintrin.h>
+__attribute__((target("xsave")))
int main(void)
{
return _xgetbv(0) & 0xe0;
}
'''
- if cc.links(prog, name: 'XSAVE intrinsics without -mxsave',
- args: test_c_args)
+ if cc.links(prog, name: 'XSAVE intrinsics', args: test_c_args)
cdata.set('HAVE_XSAVE_INTRINSICS', 1)
- elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave',
- args: test_c_args + ['-mxsave'])
- cdata.set('HAVE_XSAVE_INTRINSICS', 1)
- cflags_xsave += '-mxsave'
endif
endif
@@ -2185,12 +2180,12 @@ endif
# Check for the availability of AVX-512 popcount intrinsics.
###############################################################
-cflags_popcnt = []
if host_cpu == 'x86_64'
prog = '''
#include <immintrin.h>
+__attribute__((target("avx512vpopcntdq","avx512bw")))
int main(void)
{
const char buf[sizeof(__m512i)];
@@ -2205,13 +2200,9 @@ int main(void)
}
'''
- if cc.links(prog, name: 'AVX-512 popcount without -mavx512vpopcntdq
-mavx512bw',
+ if cc.links(prog, name: 'AVX-512 popcount',
args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
- elif cc.links(prog, name: 'AVX-512 popcount with -mavx512vpopcntdq
-mavx512bw',
- args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))]
+ ['-mavx512vpopcntdq'] + ['-mavx512bw'])
- cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
- cflags_popcnt += ['-mavx512vpopcntdq'] + ['-mavx512bw']
endif
endif
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 42f50b4976..45696247e9 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,9 +262,7 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
-CFLAGS_POPCNT = @CFLAGS_POPCNT@
CFLAGS_CRC = @CFLAGS_CRC@
-CFLAGS_XSAVE = @CFLAGS_XSAVE@
PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
PERMIT_MISSING_VARIABLE_DECLARATIONS = @PERMIT_MISSING_VARIABLE_DECLARATIONS@
CXXFLAGS = @CXXFLAGS@
@@ -762,9 +760,6 @@ LIBOBJS = @LIBOBJS@
# files needed for the chosen CRC-32C implementation
PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
-# files needed for the chosen popcount implementation
-PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
-
LIBS := -lpgcommon -lpgport $(LIBS)
# to make ws2_32.lib the last library
diff --git a/src/include/c.h b/src/include/c.h
index 55dec71a6d..6f5ca25542 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -174,6 +174,16 @@
#define pg_attribute_nonnull(...)
#endif
+/*
+ * pg_attribute_target allows specifying different target options that the
+ * function should be compiled with (e.g., for using special CPU instructions).
+ */
+#if __has_attribute (target)
+#define pg_attribute_target(...) __attribute__((target(__VA_ARGS__)))
+#else
+#define pg_attribute_target(...)
+#endif
+
/*
* Append PG_USED_FOR_ASSERTS_ONLY to definitions of variables that are only
* used in assert-enabled builds, to avoid compiler warnings about unused
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index 850e927584..479aa08420 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -102,10 +102,8 @@ pgxs_kv = {
' '.join(cflags_no_missing_var_decls),
'CFLAGS_CRC': ' '.join(cflags_crc),
- 'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
- 'CFLAGS_XSAVE': ' '.join(cflags_xsave),
'LDFLAGS': var_ldflags,
'LDFLAGS_EX': var_ldflags_ex,
@@ -181,7 +179,7 @@ pgxs_empty = [
'WANTED_LANGUAGES',
# Not needed because we don't build the server / PLs with the generated
makefile
- 'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
+ 'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
'DTRACEFLAGS', # only server has dtrace probes
'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags',
'perl_includespec', 'perl_privlibexp',
diff --git a/src/port/Makefile b/src/port/Makefile
index 9324ec2d9f..366c814bd9 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,13 +38,13 @@ LIBS += $(PTHREAD_LIBS)
OBJS = \
$(LIBOBJS) \
$(PG_CRC32C_OBJS) \
- $(PG_POPCNT_OBJS) \
bsearch_arg.o \
chklocale.o \
inet_net_ntop.o \
noblock.o \
path.o \
pg_bitutils.o \
+ pg_popcount_avx512.o \
pg_strong_random.o \
pgcheckdir.o \
pgmkdirp.o \
@@ -92,16 +92,6 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
-# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
-pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
-pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
-pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
-
-# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
-pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
-
#
# Shared library versions of object files
#
diff --git a/src/port/meson.build b/src/port/meson.build
index 1150966ab7..83a0632520 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
'noblock.c',
'path.c',
'pg_bitutils.c',
+ 'pg_popcount_avx512.c',
'pg_strong_random.c',
'pgcheckdir.c',
'pgmkdirp.c',
@@ -84,8 +85,6 @@ replace_funcs_pos = [
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
- ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
- ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK',
'xsave'],
# arm / aarch64
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -100,8 +99,8 @@ replace_funcs_pos = [
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
]
-pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave':
cflags_xsave}
-pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
+pgport_cflags = {'crc': cflags_crc}
+pgport_sources_cflags = {'crc': []}
foreach f : replace_funcs_neg
func = f.get(0)
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index 9d3149e2d0..b598e86554 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -12,7 +12,17 @@
*/
#include "c.h"
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
#include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
#include "port/pg_bitutils.h"
@@ -21,12 +31,82 @@
* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
* the function pointers that are only used when TRY_POPCNT_FAST is set.
*/
-#ifdef TRY_POPCNT_FAST
+#if defined(TRY_POPCNT_FAST) && defined(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK)
+
+/*
+ * Does CPUID say there's support for XSAVE instructions?
+ */
+static inline bool
+xsave_available(void)
+{
+ unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+ __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+ __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+ return (exx[2] & (1 << 27)) != 0; /* osxsave */
+}
+
+/*
+ * Does XGETBV say the ZMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that xsave_available() returns true
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
+static inline bool
+zmm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+ return (_xgetbv(0) & 0xe6) == 0xe6;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Does CPUID say there's support for AVX-512 popcount and byte-and-word
+ * instructions?
+ */
+static inline bool
+avx512_popcnt_available(void)
+{
+ unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID_COUNT)
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+ __cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+ return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
+ (exx[1] & (1 << 30)) != 0; /* avx512-bw */
+}
+
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_popcount() implementation.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+ return xsave_available() &&
+ zmm_regs_available() &&
+ avx512_popcnt_available();
+}
/*
* pg_popcount_avx512
* Returns the number of 1-bits in buf
*/
+pg_attribute_target("avx512vpopcntdq", "avx512bw")
uint64
pg_popcount_avx512(const char *buf, int bytes)
{
@@ -82,6 +162,7 @@ pg_popcount_avx512(const char *buf, int bytes)
* pg_popcount_masked_avx512
* Returns the number of 1-bits in buf after applying the mask to
each byte
*/
+pg_attribute_target("avx512vpopcntdq", "avx512bw")
uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
@@ -138,4 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8
mask)
return _mm512_reduce_add_epi64(accum);
}
-#endif /* TRY_POPCNT_FAST */
+#endif /* TRY_POPCNT_FAST &&
+ *
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_popcount_avx512_choose.c
b/src/port/pg_popcount_avx512_choose.c
deleted file mode 100644
index b37107803a..0000000000
--- a/src/port/pg_popcount_avx512_choose.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * pg_popcount_avx512_choose.c
- * Test whether we can use the AVX-512 pg_popcount() implementation.
- *
- * Copyright (c) 2024, PostgreSQL Global Development Group
- *
- * IDENTIFICATION
- * src/port/pg_popcount_avx512_choose.c
- *
- *-------------------------------------------------------------------------
- */
-#include "c.h"
-
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
-#include <cpuid.h>
-#endif
-
-#ifdef HAVE_XSAVE_INTRINSICS
-#include <immintrin.h>
-#endif
-
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
-#include <intrin.h>
-#endif
-
-#include "port/pg_bitutils.h"
-
-/*
- * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
- * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
- * the function pointers that are only used when TRY_POPCNT_FAST is set.
- */
-#ifdef TRY_POPCNT_FAST
-
-/*
- * Does CPUID say there's support for XSAVE instructions?
- */
-static inline bool
-xsave_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
- return (exx[2] & (1 << 27)) != 0; /* osxsave */
-}
-
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that xsave_available() returns true
- * before calling this.
- */
-static inline bool
-zmm_regs_available(void)
-{
-#ifdef HAVE_XSAVE_INTRINSICS
- return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
- return false;
-#endif
-}
-
-/*
- * Does CPUID say there's support for AVX-512 popcount and byte-and-word
- * instructions?
- */
-static inline bool
-avx512_popcnt_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID_COUNT)
- __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUIDEX)
- __cpuidex(exx, 7, 0);
-#else
-#error cpuid instruction not available
-#endif
- return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
- (exx[1] & (1 << 30)) != 0; /* avx512-bw */
-}
-
-/*
- * Returns true if the CPU supports the instructions required for the AVX-512
- * pg_popcount() implementation.
- */
-bool
-pg_popcount_avx512_available(void)
-{
- return xsave_available() &&
- zmm_regs_available() &&
- avx512_popcnt_available();
-}
-
-#endif /* TRY_POPCNT_FAST */
--
2.39.5 (Apple Git-154)