Hi,

Here is V2 of the crc32c_s390x patch. Changes from V1 are:

- added gcc 14-14.2 as known broken compiler (bug was fixed with 14.3)
- moved broken compiler check to vx extension compile&link check
- made variables global in the extension check
- create dependency to getauxval in configure, so we don't compile the code if we won't be able to detect the cpu extension at runtime
- moved buffer length check into macro
- changed minimal buffer length for crc32c_s390x from 64 to 16 byte
- added CFLAGS_CRC to all crc32c_s390x* artifacts
- fixed formatting with pgindent
- fixed typos in email address

--
Eduard Stefes <eduard.ste...@ibm.com>
From b0b9ed50cec3a9f5e856659617883c8a1b926991 Mon Sep 17 00:00:00 2001 From: "Eddy (Eduard) Stefes" <eduard.ste...@ibm.com> Date: Tue, 15 Apr 2025 10:22:05 +0200 Subject: [PATCH v2] Added crc32c extension for ibm s390x based on VX intrinsics --- config/c-compiler.m4 | 47 ++++++ configure | 262 ++++++++++++++++++++++++++++- configure.ac | 53 +++++- meson.build | 43 +++++ src/include/pg_config.h.in | 6 + src/include/port/pg_crc32c.h | 25 +++ src/port/Makefile | 5 +- src/port/meson.build | 7 + src/port/pg_crc32c_s390x.c | 263 ++++++++++++++++++++++++++++++ src/port/pg_crc32c_s390x_choose.c | 43 +++++ 10 files changed, 741 insertions(+), 13 deletions(-) create mode 100644 src/port/pg_crc32c_s390x.c create mode 100644 src/port/pg_crc32c_s390x_choose.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5f3e1d1faf9..47cb53dbd56 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -684,6 +684,53 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS +# PGAC_S390X_VECTOR_VX_INTRINSICS +# -------------------------------- +# Check if the compiler supports the S390X vector intrinsics, using +# __attribute__((vector_size(16))), vec_gfmsum_accum_128 +# +# These instructions where introduced with -march=z13. +# the test arg1 is mandatory and should be either: +# '-fzvector' for clang +# '-mzarch' for gcc +# +# If the intrinsics are supported, sets +# pgac_s390x_vector_intrinsics, and CFLAGS_CRC. 
+AC_DEFUN([PGAC_S390X_VECTOR_VX_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_s390x_vector_intrinsics_$1_$2])])dnl +AC_CACHE_CHECK([for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=$1 $2], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1 $2" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+ # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + }], + [ + vecint_gfmsum_accum_test(); + return 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_CRC="$1 $2" + AS_TR_SH([pgac_s390x_vector_intrinsics_$1_$2])=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_S390X_VECTOR_VX_INTRINSICS + # PGAC_XSAVE_INTRINSICS # --------------------- # Check if the compiler supports the XSAVE instructions using the _xgetbv diff --git a/configure b/configure index 4f15347cc95..0f2ec308eaa 100755 --- a/configure +++ b/configure @@ -17541,7 +17541,6 @@ $as_echo "#define HAVE_GCC__ATOMIC_INT64_CAS 1" >>confdefs.h fi - # Check for x86 cpuid instruction { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid" >&5 $as_echo_n "checking for __get_cpuid... " >&6; } @@ -18088,6 +18087,227 @@ if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then fi +# Check for S390X Vector intrinsics to do CRC calculations. +# +# First check for the host cpu +if test x"$host_cpu" = x"s390x" && test x"$ac_cv_func_getauxval" = x"yes"; then + # Check for all possible cflag combinations + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector " >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector ... " >&6; } +if ${pgac_cv_s390x_vector_intrinsics__fzvector_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -fzvector " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__fzvector_=yes +else + pgac_cv_s390x_vector_intrinsics__fzvector_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__fzvector_" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__fzvector_" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__fzvector_" = x"yes"; then + CFLAGS_CRC="-fzvector " + pgac_s390x_vector_intrinsics__fzvector_=yes +fi + + if test x"$pgac_s390x_vector_intrinsics__fzvector_" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector -march=z13" >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector -march=z13... 
" >&6; } +if ${pgac_cv_s390x_vector_intrinsics__fzvector__march_z13+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -fzvector -march=z13" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__fzvector__march_z13=yes +else + pgac_cv_s390x_vector_intrinsics__fzvector__march_z13=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__fzvector__march_z13" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__fzvector__march_z13" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__fzvector__march_z13" = x"yes"; then + CFLAGS_CRC="-fzvector -march=z13" + pgac_s390x_vector_intrinsics__fzvector__march_z13=yes +fi + + if test 
x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch " >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch ... " >&6; } +if ${pgac_cv_s390x_vector_intrinsics__mzarch_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mzarch " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+ # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__mzarch_=yes +else + pgac_cv_s390x_vector_intrinsics__mzarch_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__mzarch_" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__mzarch_" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__mzarch_" = x"yes"; then + CFLAGS_CRC="-mzarch " + pgac_s390x_vector_intrinsics__mzarch_=yes +fi + + if test x"$pgac_s390x_vector_intrinsics__mzarch_" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch -march=z13" >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch -march=z13... " >&6; } +if ${pgac_cv_s390x_vector_intrinsics__mzarch__march_z13+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mzarch -march=z13" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+ # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__mzarch__march_z13=yes +else + pgac_cv_s390x_vector_intrinsics__mzarch__march_z13=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__mzarch__march_z13" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__mzarch__march_z13" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__mzarch__march_z13" = x"yes"; then + CFLAGS_CRC="-mzarch -march=z13" + pgac_s390x_vector_intrinsics__mzarch__march_z13=yes +fi + + fi + fi + fi +fi + # Select CRC-32C implementation. @@ -18140,9 +18360,21 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then USE_LOONGARCH_CRC32C=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 + # Use S390X vector extension. + if (test x"$pgac_s390x_vector_intrinsics__fzvector_" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch_" = x"yes"); then + USE_S390X_CRC32C=1 + else + # Use S390X vector extension with runtime check. 
+ if test x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch__march_z13" = x"yes"; then + USE_S390X_CRC32C_WITH_RUNTIME_CHECK=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -18193,12 +18425,30 @@ $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 $as_echo "LoongArch CRCC instructions" >&6; } else + if test x"$USE_S390X_CRC32C" = x"1"; then + +$as_echo "#define USE_S390X_CRC32C 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: S390X Vector instructions for CRC" >&5 +$as_echo "S390X Vector instructions for CRC" >&6; } + else + if test x"$USE_S390X_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + +$as_echo "#define USE_S390X_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_s390x_choose.o pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: S390X Vector instructions for CRC with runtime check" >&5 +$as_echo "S390X Vector instructions for CRC with runtime check" >&6; } + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } + fi + fi fi fi fi diff --git a/configure.ac b/configure.ac index 4b8335dc613..f431965c21c 100644 --- a/configure.ac +++ b/configure.ac @@ -2139,6 +2139,23 @@ fi # with the default compiler flags. PGAC_LOONGARCH_CRC32C_INTRINSICS() +# Check for S390X Vector intrinsics to do CRC calculations. 
+# +# First check for the host cpu +if test x"$host_cpu" = x"s390x" && test x"$ac_cv_func_getauxval" = x"yes"; then + # Check for all possible cflag combinations + PGAC_S390X_VECTOR_VX_INTRINSICS([-fzvector], []) + if test x"$pgac_s390x_vector_intrinsics__fzvector_" != x"yes"; then + PGAC_S390X_VECTOR_VX_INTRINSICS([-fzvector], [-march=z13]) + if test x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" != x"yes"; then + PGAC_S390X_VECTOR_VX_INTRINSICS([-mzarch], []) + if test x"$pgac_s390x_vector_intrinsics__mzarch_" != x"yes"; then + PGAC_S390X_VECTOR_VX_INTRINSICS([-mzarch], [-march=z13]) + fi + fi + fi +fi + AC_SUBST(CFLAGS_CRC) # Select CRC-32C implementation. @@ -2191,9 +2208,21 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then USE_LOONGARCH_CRC32C=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 + # Use S390X vector extension. + if (test x"$pgac_s390x_vector_intrinsics__fzvector_" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch_" = x"yes"); then + USE_S390X_CRC32C=1 + else + # Use S390X vector extension with runtime check. + if test x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch__march_z13" = x"yes"; then + USE_S390X_CRC32C_WITH_RUNTIME_CHECK=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. 
+ USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -2228,9 +2257,21 @@ else PG_CRC32C_OBJS="pg_crc32c_loongarch.o" AC_MSG_RESULT(LoongArch CRCC instructions) else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) + if test x"$USE_S390X_CRC32C" = x"1"; then + AC_DEFINE(USE_S390X_CRC32C, 1, [Define to 1 to use S390X Vector instructions for CRC.]) + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_sb8.o" + AC_MSG_RESULT(S390X Vector instructions for CRC) + else + if test x"$USE_S390X_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + AC_DEFINE(USE_S390X_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use S390X Vector instructions with runtime check for CRC.]) + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_s390x_choose.o pg_crc32c_sb8.o" + AC_MSG_RESULT(S390X Vector instructions for CRC with runtime check) + else + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) + fi + fi fi fi fi diff --git a/meson.build b/meson.build index d142e3e408b..3177c6e1b78 100644 --- a/meson.build +++ b/meson.build @@ -2547,6 +2547,49 @@ int main(void) have_optimized_crc = true endif +elif host_cpu == 's390x' and cc.has_function('getauxval') + if cc.get_id() == 'clang' + VGFMAFLAG='-fzvector' + else + VGFMAFLAG='-mzarch' + endif + + prog = ''' +#ifdef __clang__ +# if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) +# error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+# endif +#endif +#ifdef __GNUC__ +# if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) +# error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. +# endif +#endif +#include <vecintrin.h> +unsigned long long a __attribute__((vector_size(16))) = { 0 }; +unsigned long long b __attribute__((vector_size(16))) = { 0 }; +unsigned char c __attribute__((vector_size(16))) = { 0 }; + +int main(void) { + // test for vector extension + // we can safely assume that if we have vec_gfmsum_accum_128 + // we will have all needed builtins + c = vec_gfmsum_accum_128(a, b, c); + return c[0]; +}''' + if cc.links(prog, name: 's390x_vector_vx', args: test_c_args + [VGFMAFLAG]) + # Use S390X CRC Extension unconditionally + cdata.set('USE_S390X_CRC32C',1) + cflags_crc += VGFMAFLAG + have_optimized_crc = true + elif cc.links(prog, name: 's390x_vector_vx+march=z13', args: test_c_args + [VGFMAFLAG, '-march=z13']) + # Use S390X CRC Extension, with runtime check + cdata.set('USE_S390X_CRC32C',false) + cdata.set('USE_S390X_CRC32C_WITH_RUNTIME_CHECK',1) + cflags_crc += VGFMAFLAG + cflags_crc += '-march=z13' + have_optimized_crc = true + endif endif if not have_optimized_crc diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 726a7c1be1f..a983f6d64d3 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -666,6 +666,12 @@ /* Define to 1 to use ARMv8 CRC Extension with a runtime check. */ #undef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use S390X CRC Extension. */ +#undef USE_S390X_CRC32C + +/* Define to 1 to use S390X CRC Extension with a runtime check. */ +#undef USE_S390X_CRC32C_WITH_RUNTIME_CHECK + /* Define to 1 to build with assertion checks. 
(--enable-cassert) */ #undef USE_ASSERT_CHECKING diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 82313bb7fcf..a1b11d07cd9 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -142,6 +142,31 @@ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len) extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#elif defined(USE_S390X_CRC32C) + +/* + * Use S390X vector instructions only for buffers of at least 16 bytes + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = (len) < 16 ? pg_comp_crc32c_sb8((crc),(data),(len)) : pg_comp_crc32c_s390x((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) = pg_bswap32(crc) ^ 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_s390x(pg_crc32c crc, const void *data, size_t len); + +#elif defined(USE_S390X_CRC32C_WITH_RUNTIME_CHECK) +/* + * Use S390X vector instructions only for buffers of at least 16 bytes. + * For runtime check use pg_comp_crc32c instead of pg_comp_crc32c_s390x + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = (len) < 16 ? pg_comp_crc32c_sb8((crc),(data),(len)) : pg_comp_crc32c((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) = pg_bswap32(crc) ^ 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_s390x(pg_crc32c crc, const void *data, size_t len); + #else /* * Use slicing-by-8 algorithm. 
diff --git a/src/port/Makefile b/src/port/Makefile index 4274949dfa4..fc2c2907b36 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -85,10 +85,13 @@ libpgport.a: $(OBJS) rm -f $@ $(AR) $(AROPT) $@ $^ -# all versions of pg_crc32c_armv8.o need CFLAGS_CRC +# all versions of pg_crc32c_armv8.o and pg_crc32c_s390x.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_s390x.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_s390x_shlib.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_s390x_srv.o: CFLAGS+=$(CFLAGS_CRC) # # Shared library versions of object files diff --git a/src/port/meson.build b/src/port/meson.build index fc7b059fee5..d8284a31f47 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -100,6 +100,13 @@ replace_funcs_pos = [ # loongarch ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], + # s390x + ['pg_crc32c_s390x', 'USE_S390X_CRC32C','crc'], + ['pg_crc32c_s390x', 'USE_S390X_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_s390x_choose', 'USE_S390X_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_sb8', 'USE_S390X_CRC32C'], + ['pg_crc32c_sb8', 'USE_S390X_CRC32C_WITH_RUNTIME_CHECK'], + # generic fallback ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] diff --git a/src/port/pg_crc32c_s390x.c b/src/port/pg_crc32c_s390x.c new file mode 100644 index 00000000000..4fc839c5508 --- /dev/null +++ b/src/port/pg_crc32c_s390x.c @@ -0,0 +1,263 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_s390x.c + * Hardware-accelerated CRC-32C variants for Linux on IBM Z & LinuxONE + * + * This code was originally written by Hendrik Brueckner + * <brueck...@linux.ibm.com> for use in the Linux kernel and has been + * relicensed under the postgresql-license. + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32C checksums. 
+ * + * This CRC-32C implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * Copyright (c) 2025, International Business Machines (IBM) + * + * IDENTIFICATION + * src/port/pg_crc32c_s390x.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include <vecintrin.h> + +#include "port/pg_crc32c.h" + +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +typedef unsigned char uv16qi __attribute__((vector_size(16))); +typedef unsigned int uv4si __attribute__((vector_size(16))); +typedef unsigned long long uv2di __attribute__((vector_size(16))); + +static uint32_t +crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) +{ + /*---------- + * The CRC-32C constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32C variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. 
+ * + * CRC-32C (Castagnoli) polynomials: + * + * P(x) = 0x1EDC6F41 + * P'(x) = 0x82F63B78 + */ + const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */ + const uv2di r2r1 = {0x09e4addf8, 0x740eef02}; /* R2, R1 */ + const uv2di r4r3 = {0x14cd00bd6, 0xf20c0dfe}; /* R4, R3 */ + const uv2di r5 = {0, 0x0dd45aab8}; /* R5 */ + const uv2di ru_poly = {0, 0x0dea713f1}; /* u' */ + const uv2di crc_poly = {0, 0x105ec76f0}; /* P'(x) << 1 */ + uv2di v0 = {0, 0}; + uv2di v1 = {0, 0}; + uv2di v2 = {0, 0}; + uv2di v3 = {0, 0}; + uv2di v4 = {0, 0}; + uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the vector register + * and is later XORed with the LSB portion of the loaded input data. + */ + v0 = (uv2di) vec_insert(crc, (uv4si) v0, 3); + + if (len >= 64) + { + /* Load a 64-byte data chunk and XOR with CRC */ + v1 = vec_perm(((uv2di *) buf)[0], ((uv2di *) buf)[0], perm_le2be); + v2 = vec_perm(((uv2di *) buf)[1], ((uv2di *) buf)[1], perm_le2be); + v3 = vec_perm(((uv2di *) buf)[2], ((uv2di *) buf)[2], perm_le2be); + v4 = vec_perm(((uv2di *) buf)[3], ((uv2di *) buf)[3], perm_le2be); + + v1 ^= v0; + buf += 64; + len -= 64; + + while (len >= 64) + { + /* Load the next 64-byte data chunk */ + uv16qi part1 = vec_perm(((uv16qi *) buf)[0], ((uv16qi *) buf)[0], perm_le2be); + uv16qi part2 = vec_perm(((uv16qi *) buf)[1], ((uv16qi *) buf)[1], perm_le2be); + uv16qi part3 = vec_perm(((uv16qi *) buf)[2], ((uv16qi *) buf)[2], perm_le2be); + uv16qi part4 = vec_perm(((uv16qi *) buf)[3], ((uv16qi *) buf)[3], perm_le2be); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate + * result is then folded (accumulated) with the next data chunk in + * PART1 and stored in V1. Repeat this step for the register + * contents in V2, V3, and V4 respectively. 
+ */ + v1 = (uv2di) vec_gfmsum_accum_128(r2r1, v1, part1); + v2 = (uv2di) vec_gfmsum_accum_128(r2r1, v2, part2); + v3 = (uv2di) vec_gfmsum_accum_128(r2r1, v3, part3); + v4 = (uv2di) vec_gfmsum_accum_128(r2r1, v4, part4); + + buf += 64; + len -= 64; + } + + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with + * R3 and R4 and accumulating the next 128-bit chunk until a single + * 128-bit value remains. + */ + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v2); + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v3); + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v4); + } + else + { + + /* + * Load a 16-byte data chunk and XOR with CRC + */ + v1 = vec_perm(((uv2di *) buf)[0], ((uv2di *) buf)[0], perm_le2be); + v1 ^= v0; + buf += 16; + len -= 16; + } + + while (len >= 16) + { + /* Load next data chunk */ + v2 = vec_perm(*(uv2di *) buf, *(uv2di *) buf, perm_le2be); + + /* Fold next data chunk */ + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v2); + + buf += 16; + len -= 16; + } + + /* + * Set up a vector register for byte shifts. The shift value must be + * loaded in bits 1-4 in byte element 7 of a vector register. Shift by 8 + * bytes: 0x40 Shift by 4 bytes: 0x20 + */ + + v9 = vec_insert((unsigned char) 0x40, v9, 7); + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes to + * move R4 into the rightmost doubleword and set the leftmost doubleword + * to 0x1. + */ + v0 = vec_srb(r4r3, (uv2di) v9); + v0[0] = 1; + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword of V1 is + * multiplied with R4. The leftmost doubleword of V1 is multiplied by 0x1 + * and is then XORed with rightmost product. Implicitly, the intermediate + * leftmost product becomes padded + */ + v1 = (uv2di) vec_gfmsum_128(v0, v1); + + /* + * Now do the final 32-bit fold by multiplying the rightmost word in V1 + * with R5 and XOR the result with the remaining bits in V1. 
+ * + * To achieve this by a single VGFMAG, right shift V1 by a word and store + * the result in V2 which is then accumulated. Use the vector unpack + * instruction to load the rightmost half of the doubleword into the + * rightmost doubleword element of V1; the other half is loaded in the + * leftmost doubleword. The vector register with CONST_R5 contains the R5 + * constant in the rightmost doubleword and the leftmost doubleword is + * zero to ignore the leftmost product of V1. + */ + v9 = vec_insert((unsigned char) 0x20, v9, 7); + v2 = vec_srb(v1, (uv2di) v9); + v1 = vec_unpackl((uv4si) v1); /* Split rightmost doubleword */ + v1 = (uv2di) vec_gfmsum_accum_128(r5, v1, (uv16qi) v2); + + /*---------- + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + v2 = vec_unpackl((uv4si) v1); + v2 = (uv2di) vec_gfmsum_128(ru_poly, v2); + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in V2 and + * XOR the intermediate result, T2(x), with the value in V1. The final + * result is stored in word element 2 of V2. 
+ */ + v2 = vec_unpackl((uv4si) v2); + v2 = (uv2di) vec_gfmsum_accum_128(crc_poly, v2, (uv16qi) v1); + + return ((uv4si) v2)[2]; +} + +pg_crc32c +pg_comp_crc32c_s390x(pg_crc32c crc, const void *data, size_t len) +{ + uintptr_t prealign, + aligned, + remaining; + const unsigned char *buf = data; + + /* + * Preprocess initial bytes with sb8 so the hw can start at an aligned + * address + */ + if ((uintptr_t) buf & VX_ALIGN_MASK) + { + prealign = VX_ALIGNMENT - ((uintptr_t) buf & VX_ALIGN_MASK); + len -= prealign; + crc = pg_comp_crc32c_sb8(crc, buf, prealign); + buf += prealign; + } + aligned = len & ~VX_ALIGN_MASK; + remaining = len & VX_ALIGN_MASK; + + /* Process major part of the data with hw acceleration */ + if (aligned) + { + crc = pg_bswap32(crc32_le_vgfm_16(pg_bswap32(crc), buf, (size_t) aligned)); + } + + /* Process remaining bytes that could not be handled by hw */ + if (remaining) + { + crc = pg_comp_crc32c_sb8(crc, buf + aligned, remaining); + } + + return crc; +} diff --git a/src/port/pg_crc32c_s390x_choose.c b/src/port/pg_crc32c_s390x_choose.c new file mode 100644 index 00000000000..124f0ce71c3 --- /dev/null +++ b/src/port/pg_crc32c_s390x_choose.c @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_s390x_choose.c + * Choose between S390X vectorized CRC-32C and software CRC-32C + * implementation. + * + * On first call, checks if the CPU we're running on supports the + * S390X_VX Extension. If it does, use the special instructions for + * CRC-32C computation. Otherwise, fall back to the pure software + * implementation (slicing-by-8). + * + * Copyright (c) 2025, International Business Machines (IBM) + * + * IDENTIFICATION + * src/port/pg_crc32c_s390x_choose.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" +#include "port/pg_crc32c.h" + +#include <sys/auxv.h> + +/* + * This gets called on the first call. 
It replaces the function pointer + * so that subsequent calls are routed directly to the chosen implementation. + */ +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +{ + /* default call sb8 */ + pg_comp_crc32c = pg_comp_crc32c_sb8; + + if (getauxval(AT_HWCAP) & HWCAP_S390_VX) + { + pg_comp_crc32c = pg_comp_crc32c_s390x; + } + + return pg_comp_crc32c(crc, data, len); +} + +pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; -- 2.49.0