On Tue, Mar 11, 2025 at 4:47 AM Nathan Bossart <nathandboss...@gmail.com> wrote: > > On Mon, Mar 10, 2025 at 03:48:31PM +0700, John Naylor wrote: > > On Tue, Mar 4, 2025 at 2:11 AM Nathan Bossart <nathandboss...@gmail.com> > > wrote: > >> Overall, I wish we could avoid splitting things into separate files and > >> adding more header file gymnastics, but maybe there isn't much better we > >> can do without overhauling the CPU feature detection code. > > > > I wanted to make an attempt to make this aspect nicer. v13-0002 > > incorporates deliberately compact and simple loops for inlined > > constant input into the dispatch function, and leaves the existing > > code alone. This avoids code churn and saves vertical space in the > > copied code. It needs a bit more commentary, but I hope this is a more > > digestible prerequisite to the CLMUL algorithm -- as a reminder, it'll > > be simpler if we can always assume non-constant input can go through a > > function pointer. > > That is certainly more readable. FWIW I think it would be entirely > reasonable to replace the pg_crc32c_sse42.c implementation with a call to > this new pg_comp_crc32c_dispatch() function. Of course, you'd have to > split things up like: > [snip]
That could work as well. I'm thinking if we do PMULL on Arm, it might be advantageous to keep the inline path and the function path distinctly coded -- because of the pickier alignment on that platform, it might not be worth pre-aligning the pointer to 8 bytes for a 20-byte constant input. I've gone ahead and added the generated AVX-512 algorithm in v14-0005, and added the build support and some of the runtime support from Paul and Raghuveer's earlier patches in 0006 and 0007. It passes CI, but I'll have to arrange access to other hardware to verify the runtime behavior. I think the Meson support is most of the way there, but it looks like configure.ac got whacked around cosmetically quite a bit. If we feel it's time to refactor things there, we'll want to split that out. In any case, for autoconf I've pretty much kept the earlier work for now. -- John Naylor Amazon Web Services
From adfe02a6d169be865937b567bc1b2b2ffde60631 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 11 Mar 2025 11:20:20 +0700 Subject: [PATCH v14 4/8] Always do runtime check for x86 to simplify PCLMUL --- configure | 2 +- configure.ac | 2 +- src/include/port/pg_crc32c.h | 20 ++++++++++++++------ src/port/meson.build | 1 + src/port/pg_crc32c_sse42.c | 2 +- src/port/pg_crc32c_sse42_choose.c | 2 ++ 6 files changed, 20 insertions(+), 9 deletions(-) diff --git a/configure b/configure index 93fddd69981..91c0ffc8272 100755 --- a/configure +++ b/configure @@ -17684,7 +17684,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else diff --git a/configure.ac b/configure.ac index b6d02f5ecc7..a85bdbd4ff6 100644 --- a/configure.ac +++ b/configure.ac @@ -2151,7 +2151,7 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 229f4f6a65a..28253b48018 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -47,7 +47,10 @@ typedef uint32 pg_crc32c; #define EQ_CRC32C(c1, c2) ((c1) == (c2)) #if defined(USE_SSE42_CRC32C) -/* Use Intel SSE4.2 instructions. */ +/* + * Use either Intel SSE 4.2 or PCLMUL instructions. We don't need a runtime check + * for SSE 4.2, so we can inline those in some cases. 
+ */ #include <nmmintrin.h> @@ -55,7 +58,11 @@ typedef uint32 pg_crc32c; ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); +#endif pg_attribute_no_sanitize_alignment() static inline @@ -67,9 +74,9 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) const unsigned char *p = data; /* - * For constant inputs, inline the computation to avoid the - * indirect function call. This also allows the compiler to unroll - * loops for small inputs. + * For constant inputs, inline the computation to avoid the indirect + * function call. This also allows the compiler to unroll loops for + * small inputs. */ #if SIZEOF_VOID_P >= 8 for (; len >= 8; p += 8, len -= 8) @@ -82,7 +89,8 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) return crc; } else - return pg_comp_crc32c_sse42(crc, data, len); + /* Otherwise, use a runtime check for PCLMUL instructions. */ + return pg_comp_crc32c(crc, data, len); } #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) @@ -123,7 +131,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* - * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first + * Use ARMv8 instructions, but perform a runtime check first * to check that they are available. 
*/ #define COMP_CRC32C(crc, data, len) \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..8d70a4d510e 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -83,6 +83,7 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 2001e69850b..c57d6c6293b 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -152,7 +152,7 @@ pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length) len = end - buf; } - return pg_comp_crc32c_sse42_inline(crc0, buf, len); + return pg_comp_crc32c_sse42(crc0, buf, len); } #endif diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index abea0f90eb3..89a48c76894 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -55,8 +55,10 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) pg_comp_crc32c = pg_comp_crc32c_pclmul; #endif } +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK else pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif return pg_comp_crc32c(crc, data, len); } -- 2.48.1
From 61fe19c116f2593757eedd391ca5e9e80c543aee Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 11 Mar 2025 16:58:10 +0700 Subject: [PATCH v14 8/8] Temp fixup: build of benchmark on Windows --- src/include/port/pg_crc32c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index a45f56a9405..ee3245c2042 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -99,7 +99,7 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); -extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); -- 2.48.1
From 5c07fe6c3ecacf2cbcc2c3e081a1bfb2a2fc259b Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 11 Mar 2025 14:57:01 +0700 Subject: [PATCH v14 7/8] AVX-512 CRC / autoconf Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> Author: Paul Amonson <paul.d.amon...@intel.com> --- config/c-compiler.m4 | 30 +++++++++ configure | 151 ++++++++++++++++++++++++++----------------- configure.ac | 104 +++++++++++++---------------- 3 files changed, 164 insertions(+), 121 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c13..f172f260e4e 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -577,6 +577,36 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX512_CRC32_INTRINSICS +# --------------------------- +# Check if the compiler supports the x86 CRC instructions added in AVX-512, +# using intrinsics with function __attribute__((target("..."))): + +AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics])])dnl +AC_CACHE_CHECK([for _mm512_clmulepi64_epi128 with function attribute], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h> + #include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx512vl,vpclmulqdq"))) + #endif + static int crc32_avx512_test(void) + { + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + }], + [return crc32_avx512_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# 
PGAC_AVX512_CRC32_INTRINSICS # PGAC_ARMV8_CRC32C_INTRINSICS # ---------------------------- diff --git a/configure b/configure index 91c0ffc8272..a7c3d56f9f2 100755 --- a/configure +++ b/configure @@ -17381,7 +17381,7 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations. # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 $as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; } @@ -17425,6 +17425,52 @@ if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then fi +# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with with +# the __attribute__((target("avx512vl,vpclmulqdq"))). +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128 with function attribute" >&5 +$as_echo_n "checking for _mm512_clmulepi64_epi128 with function attribute... " >&6; } +if ${pgac_cv_avx512_crc32_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <immintrin.h> + #include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx512vl,vpclmulqdq"))) + #endif + static int crc32_avx512_test(void) + { + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + } +int +main () +{ +return crc32_avx512_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_crc32_intrinsics=yes +else + pgac_cv_avx512_crc32_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics" >&5 +$as_echo "$pgac_cv_avx512_crc32_intrinsics" >&6; } +if test x"$pgac_cv_avx512_crc32_intrinsics" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi + + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -17626,9 +17672,8 @@ fi # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. 
And if @@ -17645,88 +17690,72 @@ fi # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 - else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 - else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 - else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 - else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi - fi - fi - fi - fi -fi -# Set PG_CRC32C_OBJS appropriately depending on the selected implementation. { $as_echo "$as_me:${as_lineno-$LINENO}: checking which CRC-32C implementation to use" >&5 $as_echo_n "checking which CRC-32C implementation to use... 
" >&6; } -if test x"$USE_SSE42_CRC32C" = x"1"; then +if test x"$host_cpu" = x"x86_64"; then + #x86 only: + PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 -$as_echo "SSE 4.2" >&6; } -else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C baseline feature SSE 4.2" >&5 +$as_echo "CRC32C baseline feature SSE 4.2" >&6; } + else + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 -$as_echo "SSE 4.2 with runtime check" >&6; } - else - if test x"$USE_ARMV8_CRC32C" = x"1"; then + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C SSE42 with runtime check" >&5 +$as_echo "CRC32C SSE42 with runtime check" >&6; } + fi + fi + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + +$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C AVX-512 with runtime check" >&5 +$as_echo "CRC32C AVX-512 with runtime check" >&6; } + fi +else + # non x86 code: + # Use ARM CRC Extension if available. 
+ if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 $as_echo "ARMv8 CRC instructions" >&6; } - else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + else + # ARM CRC Extension, with runtime check? + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } - else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + else + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 $as_echo "LoongArch CRCC instructions" >&6; } - else + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. 
$as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } - fi fi fi fi diff --git a/configure.ac b/configure.ac index a85bdbd4ff6..ee8b225ed87 100644 --- a/configure.ac +++ b/configure.ac @@ -2057,10 +2057,14 @@ if test x"$host_cpu" = x"x86_64"; then fi fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations. # PGAC_SSE42_CRC32_INTRINSICS() +# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with with +# the __attribute__((target("avx512vl,vpclmulqdq"))). +PGAC_AVX512_CRC32_INTRINSICS([]) + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [ @@ -2096,9 +2100,8 @@ AC_SUBST(CFLAGS_CRC) # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if @@ -2115,69 +2118,50 @@ AC_SUBST(CFLAGS_CRC) # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. 
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 - else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 + +AC_MSG_CHECKING([which CRC-32C implementation to use]) +if test x"$host_cpu" = x"x86_64"; then + #x86 only: + PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + AC_MSG_RESULT(CRC32C baseline feature SSE 4.2) else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 - else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 - else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. 
- USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + AC_MSG_RESULT(CRC32C SSE42 with runtime check) fi - fi fi - fi -fi - -# Set PG_CRC32C_OBJS appropriately depending on the selected implementation. -AC_MSG_CHECKING([which CRC-32C implementation to use]) -if test x"$USE_SSE42_CRC32C" = x"1"; then - AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" - AC_MSG_RESULT(SSE 4.2) + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel AVX 512 CRC instructions with a runtime check.]) + AC_MSG_RESULT(CRC32C AVX-512 with runtime check) + fi else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - AC_MSG_RESULT(SSE 4.2 with runtime check) + # non x86 code: + # Use ARM CRC Extension if available. + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then + AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + AC_MSG_RESULT(ARMv8 CRC instructions) else - if test x"$USE_ARMV8_CRC32C" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - AC_MSG_RESULT(ARMv8 CRC instructions) + # ARM CRC Extension, with runtime check? 
+ if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then + AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then - AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - AC_MSG_RESULT(LoongArch CRCC instructions) - else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) - fi + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) fi fi fi -- 2.48.1
From 85970737d58d3fb46e954f4b056a8411c0870882 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 11 Mar 2025 13:07:01 +0700 Subject: [PATCH v14 5/8] Add runtime support for AVX-512 CRC --- src/port/pg_crc32c_sse42_choose.c | 59 ++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 89a48c76894..c2d25253c2c 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -20,16 +20,37 @@ #include "c.h" -#ifdef HAVE__GET_CPUID +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include <cpuid.h> #endif -#ifdef HAVE__CPUID +#include <immintrin.h> + +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) #include <intrin.h> #endif #include "port/pg_crc32c.h" +/* + * Does XGETBV say the ZMM registers are enabled? + * + * NB: Caller is responsible for verifying that osxsave is available + * before calling this. + */ +#ifdef HAVE_XSAVE_INTRINSICS +pg_attribute_target("xsave") +#endif +static inline bool +zmm_regs_available(void) +{ +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0xe6) == 0xe6; +#else + return false; +#endif +} + /* * This gets called on the first call. It replaces the function pointer * so that subsequent calls are routed directly to the chosen implementation. @@ -39,6 +60,15 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { unsigned int exx[4] = {0, 0, 0, 0}; +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK + + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. 
+ */ + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + #if defined(HAVE__GET_CPUID) __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUID) @@ -50,15 +80,26 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ { pg_comp_crc32c = pg_comp_crc32c_sse42; -#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK - if ((exx[2] & (1 << 1)) != 0) /* PCLMUL */ - pg_comp_crc32c = pg_comp_crc32c_pclmul; + + if (exx[2] & (1 << 27) && /* OSXSAVE */ + zmm_regs_available()) + { +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + /* second cpuid call on leaf 7 to check extended avx512 support */ + + memset(exx, 0, 4 * sizeof(exx[0])); + +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); #endif - } -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK - else - pg_comp_crc32c = pg_comp_crc32c_sb8; + if (exx[2] & (1 << 10) && /* VPCLMULQDQ */ + exx[1] & (1 << 31)) /* AVX512-VL */ + pg_comp_crc32c = pg_comp_crc32c_pclmul; #endif + } + } return pg_comp_crc32c(crc, data, len); } -- 2.48.1
From 6ae6af741b866ed95d367d810cdd4eef64a6ac91 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 11 Mar 2025 14:16:13 +0700 Subject: [PATCH v14 6/8] AVX-512 CRC / Meson Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> Author: Paul Amonson <paul.d.amon...@intel.com> --- meson.build | 23 ++++++++++ src/include/port/pg_crc32c.h | 9 +--- src/port/pg_crc32c_sse42.c | 83 +++++++++++++++--------------------- 3 files changed, 60 insertions(+), 55 deletions(-) diff --git a/meson.build b/meson.build index 13c13748e5d..f2f1164a25e 100644 --- a/meson.build +++ b/meson.build @@ -2352,6 +2352,29 @@ int main(void) have_optimized_crc = true endif + avx512_crc_prog = ''' +#include <immintrin.h> +#include <stdint.h> +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("avx512vl,vpclmulqdq"))) +#endif +int main(void) +{ + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); +} +''' + + if cc.links(avx512_crc_prog, + name: 'AVX-512 CRC32C', + args: test_c_args) + cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1) + endif + endif elif host_cpu == 'arm' or host_cpu == 'aarch64' diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 28253b48018..a45f56a9405 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -37,11 +37,6 @@ typedef uint32 pg_crc32c; -/* WIP: configure checks */ -#ifdef __x86_64__ -#define USE_PCLMUL_WITH_RUNTIME_CHECK -#endif - /* The INIT and EQ macros are the same for all implementations. 
*/ #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) @@ -60,7 +55,7 @@ typedef uint32 pg_crc32c; extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); #endif @@ -106,7 +101,7 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); #endif diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index c57d6c6293b..f392eb5b236 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -15,7 +15,7 @@ #include "c.h" #include <nmmintrin.h> -#include <wmmintrin.h> +#include <immintrin.h> #include "port/pg_crc32c.h" @@ -70,16 +70,16 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) return crc; } -#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK /* Generated by https://github.com/corsix/fast-crc32/ using: */ -/* ./generate -i sse -p crc32c -a v4e */ +/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */ /* MIT licensed */ -#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) -#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) +#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) +#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) -pg_attribute_target("sse4.2,pclmul") +pg_attribute_target("avx512vl,vpclmulqdq") pg_crc32c 
pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length) { @@ -88,67 +88,54 @@ pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length) size_t len = length; const char *buf = data; - // This prolog is trying to avoid loads straddling - // cache lines, but it doesn't seem worth it if - // we're trying to be fast on small inputs as well -#if 0 - for (; len && ((uintptr_t) buf & 7); --len) + /* Align on cacheline boundary. WIP: The threshold needs testing. */ + if (unlikely(len > 256)) { - crc0 = _mm_crc32_u8(crc0, *buf++); - } - if (((uintptr_t) buf & 8) && len >= 8) - { - crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf); - buf += 8; - len -= 8; + for (; len && ((uintptr_t) buf & 7); --len) + { + crc0 = _mm_crc32_u8(crc0, *buf++); + } + while (((uintptr_t) buf & 56) && len >= 8) + { + crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } } -#endif + if (len >= 64) { const char *end = buf + len; const char *limit = buf + len - 64; + __m128i z0; /* First vector chunk. */ - __m128i x0 = _mm_loadu_si128((const __m128i *) buf), + __m512i x0 = _mm512_loadu_si512((const void *) buf), y0; - __m128i x1 = _mm_loadu_si128((const __m128i *) (buf + 16)), - y1; - __m128i x2 = _mm_loadu_si128((const __m128i *) (buf + 32)), - y2; - __m128i x3 = _mm_loadu_si128((const __m128i *) (buf + 48)), - y3; - __m128i k; - - k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); - x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + __m512i k; + + k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0)); + x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); buf += 64; + /* Main loop. 
*/ while (buf <= limit) { y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); - y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0); - y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1); - y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2); - y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3); + x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void *) buf), 0x96); buf += 64; } - /* Reduce x0 ... x3 to just x0. */ - k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); - y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); - y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); - k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); - y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); - y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); + /* Reduce 512 bits to 128 bits. */ + k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb, 0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0); + y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); + y0 = _mm512_xor_si512(y0, k); + z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96); + z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); /* Reduce 128 bits to 32 bits, and multiply by x^32. */ - crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); - crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1)); len = end - buf; } -- 2.48.1
From 9c61eb35c4f104c208e22502b7e10fb2a8efdc14 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Fri, 28 Feb 2025 16:27:30 +0700 Subject: [PATCH v14 2/8] Inline CRC computation for fixed-length input Use a simplified copy of the loop in pg_crc32c_sse42.c to avoid moving code to a separate header. --- src/include/port/pg_crc32c.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 65ebeacf4b1..b9f0a8c7cca 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -43,12 +43,43 @@ typedef uint32 pg_crc32c; #if defined(USE_SSE42_CRC32C) /* Use Intel SSE4.2 instructions. */ + +#include <nmmintrin.h> + #define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c_sse42((crc), (data), (len))) + ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +pg_attribute_no_sanitize_alignment() +static inline +pg_crc32c +pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) +{ + if (__builtin_constant_p(len)) + { + const unsigned char *p = data; + + /* + * For constant inputs, inline the computation to avoid the + * indirect function call. This also allows the compiler to unroll + * loops for small inputs. + */ +#if SIZEOF_VOID_P >= 8 + for (; len >= 8; p += 8, len -= 8) + crc = _mm_crc32_u64(crc, *(const uint64 *) p); +#endif + for (; len >= 4; p += 4, len -= 4) + crc = _mm_crc32_u32(crc, *(const uint32 *) p); + for (; len > 0; --len) + crc = _mm_crc32_u8(crc, *p++); + return crc; + } + else + return pg_comp_crc32c_sse42(crc, data, len); +} + #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. */ -- 2.48.1
From 98e0d9cf96cc9a5eb8ce1eac9517693bb078bfe2 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Wed, 12 Feb 2025 15:27:16 +0700 Subject: [PATCH v14 3/8] Improve CRC32C performance on x86_64 The current SSE4.2 implementation of CRC32C relies on the native CRC32 instruction, which operates on 8 bytes at a time. We can get a substantial speedup on longer inputs by using carryless multiplication on SIMD registers, processing 64 bytes per loop iteration. The PCLMULQDQ instruction has been widely available since 2011 (almost as old as SSE 4.2), so this commit now requires that, as well as SSE 4.2, to build pg_crc32c_sse42.c. The MIT-licensed implementation was generated with the "generate" program from https://github.com/corsix/fast-crc32/ Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> Author: John Naylor <johncnaylo...@gmail.com> Discussion: https://postgr.es/m/ph8pr11mb82869ff741dfa4e9a029ff13fb...@ph8pr11mb8286.namprd11.prod.outlook.com --- src/include/port/pg_crc32c.h | 30 ++++++++--- src/port/pg_crc32c_sse42.c | 88 +++++++++++++++++++++++++++++++ src/port/pg_crc32c_sse42_choose.c | 26 ++++----- 3 files changed, 124 insertions(+), 20 deletions(-) diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index b9f0a8c7cca..229f4f6a65a 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -37,6 +37,11 @@ typedef uint32 pg_crc32c; +/* WIP: configure checks */ +#ifdef __x86_64__ +#define USE_PCLMUL_WITH_RUNTIME_CHECK +#endif + /* The INIT and EQ macros are the same for all implementations. 
*/ #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) @@ -80,6 +85,23 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) return pg_comp_crc32c_sse42(crc, data, len); } +#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) + +/* + * Use Intel SSE 4.2 or PCLMUL instructions, but perform a runtime check first + * to check that they are available. + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); +#endif + #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. */ @@ -98,7 +120,7 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); -#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) +#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first @@ -110,13 +132,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); - -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#endif -#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); -#endif #else /* diff --git 
a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df31..2001e69850b 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -15,6 +15,7 @@ #include "c.h" #include <nmmintrin.h> +#include <wmmintrin.h> #include "port/pg_crc32c.h" @@ -68,3 +69,90 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) return crc; } + +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i sse -p crc32c -a v4e */ +/* MIT licensed */ + +#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0)) +#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17)) + +pg_attribute_target("sse4.2,pclmul") +pg_crc32c +pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + size_t len = length; + const char *buf = data; + + // This prolog is trying to avoid loads straddling + // cache lines, but it doesn't seem worth it if + // we're trying to be fast on small inputs as well +#if 0 + for (; len && ((uintptr_t) buf & 7); --len) + { + crc0 = _mm_crc32_u8(crc0, *buf++); + } + if (((uintptr_t) buf & 8) && len >= 8) + { + crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } +#endif + if (len >= 64) + { + const char *end = buf + len; + const char *limit = buf + len - 64; + + /* First vector chunk. */ + __m128i x0 = _mm_loadu_si128((const __m128i *) buf), + y0; + __m128i x1 = _mm_loadu_si128((const __m128i *) (buf + 16)), + y1; + __m128i x2 = _mm_loadu_si128((const __m128i *) (buf + 32)), + y2; + __m128i x3 = _mm_loadu_si128((const __m128i *) (buf + 48)), + y3; + __m128i k; + + k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0); + x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0); + buf += 64; + /* Main loop. 
*/ + while (buf <= limit) + { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k); + y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0); + y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1); + y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2); + y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3); + buf += 64; + } + + /* Reduce x0 ... x3 to just x0. */ + k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k); + y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0); + y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2); + k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0); + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1)); + len = end - buf; + } + + return pg_comp_crc32c_sse42_inline(crc0, buf, len); +} + +#endif diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 65dbc4d4249..abea0f90eb3 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -30,8 +30,12 @@ #include "port/pg_crc32c.h" -static bool -pg_crc32c_sse42_available(void) +/* + * This gets called on the first call. It replaces the function pointer + * so that subsequent calls are routed directly to the chosen implementation. 
+ */ +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { unsigned int exx[4] = {0, 0, 0, 0}; @@ -43,18 +47,14 @@ pg_crc32c_sse42_available(void) #error cpuid instruction not available #endif - return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ -} - -/* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) -{ - if (pg_crc32c_sse42_available()) + if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ + { pg_comp_crc32c = pg_comp_crc32c_sse42; +#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK + if ((exx[2] & (1 << 1)) != 0) /* PCLMUL */ + pg_comp_crc32c = pg_comp_crc32c_pclmul; +#endif + } else pg_comp_crc32c = pg_comp_crc32c_sb8; -- 2.48.1
From 0b978f21bba12d62477c5fa982f99a76563cfaf4 Mon Sep 17 00:00:00 2001 From: Paul Amonson <paul.d.amon...@intel.com> Date: Mon, 6 May 2024 08:34:17 -0700 Subject: [PATCH v14 1/8] Add a Postgres SQL function for crc32c benchmarking Add a drive_crc32c() function to use for benchmarking crc32c computation. The function takes 2 arguments: (1) count: num of times CRC32C is computed in a loop. (2) num: #bytes in the buffer to calculate crc over. XXX not for commit Extracted from a patch by Raghuveer Devulapalli --- contrib/meson.build | 1 + contrib/test_crc32c/Makefile | 20 +++++++ contrib/test_crc32c/expected/test_crc32c.out | 57 ++++++++++++++++++++ contrib/test_crc32c/meson.build | 34 ++++++++++++ contrib/test_crc32c/sql/test_crc32c.sql | 3 ++ contrib/test_crc32c/test_crc32c--1.0.sql | 1 + contrib/test_crc32c/test_crc32c.c | 47 ++++++++++++++++ contrib/test_crc32c/test_crc32c.control | 4 ++ 8 files changed, 167 insertions(+) create mode 100644 contrib/test_crc32c/Makefile create mode 100644 contrib/test_crc32c/expected/test_crc32c.out create mode 100644 contrib/test_crc32c/meson.build create mode 100644 contrib/test_crc32c/sql/test_crc32c.sql create mode 100644 contrib/test_crc32c/test_crc32c--1.0.sql create mode 100644 contrib/test_crc32c/test_crc32c.c create mode 100644 contrib/test_crc32c/test_crc32c.control diff --git a/contrib/meson.build b/contrib/meson.build index 1ba73ebd67a..06673db0625 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -12,6 +12,7 @@ contrib_doc_args = { 'install_dir': contrib_doc_dir, } +subdir('test_crc32c') subdir('amcheck') subdir('auth_delay') subdir('auto_explain') diff --git a/contrib/test_crc32c/Makefile b/contrib/test_crc32c/Makefile new file mode 100644 index 00000000000..5b747c6184a --- /dev/null +++ b/contrib/test_crc32c/Makefile @@ -0,0 +1,20 @@ +MODULE_big = test_crc32c +OBJS = test_crc32c.o +PGFILEDESC = "test" +EXTENSION = test_crc32c +DATA = test_crc32c--1.0.sql + +first: all + +# test_crc32c.o: CFLAGS+=-g + 
+ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_crc32c +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/test_crc32c/expected/test_crc32c.out b/contrib/test_crc32c/expected/test_crc32c.out new file mode 100644 index 00000000000..dff6bb3133b --- /dev/null +++ b/contrib/test_crc32c/expected/test_crc32c.out @@ -0,0 +1,57 @@ +CREATE EXTENSION test_crc32c; +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; + drive_crc32c +-------------- + 532139994 + 2103623867 + 785984197 + 2686825890 + 3213049059 + 3819630168 + 1389234603 + 534072900 + 2930108140 + 2496889855 + 1475239611 + 136366931 + 3067402116 + 2012717871 + 3682416023 + 2054270645 + 1817339875 + 4100939569 + 1192727539 + 3636976218 + 369764421 + 3161609879 + 1067984880 + 1235066769 + 3138425899 + 648132037 + 4203750233 + 1330187888 + 2683521348 + 1951644495 + 2574090107 + 3904902018 + 3772697795 + 1644686344 + 2868962106 + 3369218491 + 3902689890 + 3456411865 + 141004025 + 1504497996 + 3782655204 + 3544797610 + 3429174879 + 2524728016 + 3935861181 + 25498897 + 692684159 + 345705535 + 2761600287 + 2654632420 + 3945991399 +(51 rows) + diff --git a/contrib/test_crc32c/meson.build b/contrib/test_crc32c/meson.build new file mode 100644 index 00000000000..d7bec4ba1cb --- /dev/null +++ b/contrib/test_crc32c/meson.build @@ -0,0 +1,34 @@ +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +test_crc32c_sources = files( + 'test_crc32c.c', +) + +if host_system == 'windows' + test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_crc32c', + '--FILEDESC', 'test_crc32c - test code for crc32c library',]) +endif + +test_crc32c = shared_module('test_crc32c', + test_crc32c_sources, + kwargs: contrib_mod_args, +) +contrib_targets += test_crc32c + +install_data( + 'test_crc32c.control', + 
'test_crc32c--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'test_crc32c', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_crc32c', + ], + }, +} diff --git a/contrib/test_crc32c/sql/test_crc32c.sql b/contrib/test_crc32c/sql/test_crc32c.sql new file mode 100644 index 00000000000..95c6dfe4488 --- /dev/null +++ b/contrib/test_crc32c/sql/test_crc32c.sql @@ -0,0 +1,3 @@ +CREATE EXTENSION test_crc32c; + +select drive_crc32c(1, i) from generate_series(100, 300, 4) i; diff --git a/contrib/test_crc32c/test_crc32c--1.0.sql b/contrib/test_crc32c/test_crc32c--1.0.sql new file mode 100644 index 00000000000..52b9772f908 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c--1.0.sql @@ -0,0 +1 @@ +CREATE FUNCTION drive_crc32c (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/contrib/test_crc32c/test_crc32c.c b/contrib/test_crc32c/test_crc32c.c new file mode 100644 index 00000000000..28bc42de314 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.c @@ -0,0 +1,47 @@ +/* select drive_crc32c(1000000, 1024); */ + +#include "postgres.h" +#include "fmgr.h" +#include "port/pg_crc32c.h" +#include "common/pg_prng.h" + +PG_MODULE_MAGIC; + +/* + * drive_crc32c(count: int, num: int) returns bigint + * + * count is the number of loops to perform + * + * num is the number of bytes in the buffer to calculate + * crc32c over. 
+ */ +PG_FUNCTION_INFO_V1(drive_crc32c); +Datum +drive_crc32c(PG_FUNCTION_ARGS) +{ + int64 count = PG_GETARG_INT32(0); + int64 num = PG_GETARG_INT32(1); + char* data = malloc((size_t)num); + pg_crc32c crc; + pg_prng_state state; + uint64 seed = 42; + pg_prng_seed(&state, seed); + /* set random data */ + for (uint64 i = 0; i < num; i++) + { + data[i] = pg_prng_uint32(&state) % 255; + } + + INIT_CRC32C(crc); + + while(count--) + { + INIT_CRC32C(crc); + COMP_CRC32C(crc, data, num); + FIN_CRC32C(crc); + } + + free((void *)data); + + PG_RETURN_INT64((int64_t)crc); +} diff --git a/contrib/test_crc32c/test_crc32c.control b/contrib/test_crc32c/test_crc32c.control new file mode 100644 index 00000000000..878a077ee18 --- /dev/null +++ b/contrib/test_crc32c/test_crc32c.control @@ -0,0 +1,4 @@ +comment = 'test' +default_version = '1.0' +module_pathname = '$libdir/test_crc32c' +relocatable = true -- 2.48.1