Hi,

Here is V2 of the crc32c_s390x patch. Changes from V1 are:

- added gcc 14-14.2 as known broken compiler (bug was fixed with 14.3)
- moved broken compiler check to vx extension compile&link check
- made variables global in the extension check
- create dependency to getauxval in configure, so we don't compile the code if we won't be able to detect the cpu extension at runtime
- moved buffer length check into macro
- changed minimal buffer length for crc32c_s390x from 64 to 16 byte
- added CFLAGS_CRC to all crc32c_s390x* artifacts
- fixed formatting with pgindent
- fixed typos in email address

--
Eduard Stefes <eduard.ste...@ibm.com>
From b0b9ed50cec3a9f5e856659617883c8a1b926991 Mon Sep 17 00:00:00 2001 From: "Eddy (Eduard) Stefes" <eduard.ste...@ibm.com> Date: Tue, 15 Apr 2025 10:22:05 +0200 Subject: [PATCH v2] Added crc32c extension for ibm s390x based on VX intrinsics --- config/c-compiler.m4 | 47 ++++++ configure | 262 ++++++++++++++++++++++++++++- configure.ac | 53 +++++- meson.build | 43 +++++ src/include/pg_config.h.in | 6 + src/include/port/pg_crc32c.h | 25 +++ src/port/Makefile | 5 +- src/port/meson.build | 7 + src/port/pg_crc32c_s390x.c | 263 ++++++++++++++++++++++++++++++ src/port/pg_crc32c_s390x_choose.c | 43 +++++ 10 files changed, 741 insertions(+), 13 deletions(-) create mode 100644 src/port/pg_crc32c_s390x.c create mode 100644 src/port/pg_crc32c_s390x_choose.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5f3e1d1faf9..47cb53dbd56 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -684,6 +684,53 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS +# PGAC_S390X_VECTOR_VX_INTRINSICS +# -------------------------------- +# Check if the compiler supports the S390X vector intrinsics, using +# __attribute__((vector_size(16))), vec_gfmsum_accum_128 +# +# These instructions where introduced with -march=z13. +# the test arg1 is mandatory and should be either: +# '-fzvector' for clang +# '-mzarch' for gcc +# +# If the intrinsics are supported, sets +# pgac_s390x_vector_intrinsics, and CFLAGS_CRC. 
+AC_DEFUN([PGAC_S390X_VECTOR_VX_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_s390x_vector_intrinsics_$1_$2])])dnl +AC_CACHE_CHECK([for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=$1 $2], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1 $2" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+ # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + }], + [ + vecint_gfmsum_accum_test(); + return 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_CRC="$1 $2" + AS_TR_SH([pgac_s390x_vector_intrinsics_$1_$2])=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_S390X_VECTOR_VX_INTRINSICS + # PGAC_XSAVE_INTRINSICS # --------------------- # Check if the compiler supports the XSAVE instructions using the _xgetbv diff --git a/configure b/configure index 4f15347cc95..0f2ec308eaa 100755 --- a/configure +++ b/configure @@ -17541,7 +17541,6 @@ $as_echo "#define HAVE_GCC__ATOMIC_INT64_CAS 1" >>confdefs.h fi - # Check for x86 cpuid instruction { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid" >&5 $as_echo_n "checking for __get_cpuid... " >&6; } @@ -18088,6 +18087,227 @@ if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then fi +# Check for S390X Vector intrinsics to do CRC calculations. +# +# First check for the host cpu +if test x"$host_cpu" = x"s390x" && test x"$ac_cv_func_getauxval" = x"yes"; then + # Check for all possible cflag combinations + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector " >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector ... " >&6; } +if ${pgac_cv_s390x_vector_intrinsics__fzvector_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -fzvector " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__fzvector_=yes +else + pgac_cv_s390x_vector_intrinsics__fzvector_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__fzvector_" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__fzvector_" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__fzvector_" = x"yes"; then + CFLAGS_CRC="-fzvector " + pgac_s390x_vector_intrinsics__fzvector_=yes +fi + + if test x"$pgac_s390x_vector_intrinsics__fzvector_" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector -march=z13" >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-fzvector -march=z13... 
" >&6; } +if ${pgac_cv_s390x_vector_intrinsics__fzvector__march_z13+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -fzvector -march=z13" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__fzvector__march_z13=yes +else + pgac_cv_s390x_vector_intrinsics__fzvector__march_z13=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__fzvector__march_z13" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__fzvector__march_z13" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__fzvector__march_z13" = x"yes"; then + CFLAGS_CRC="-fzvector -march=z13" + pgac_s390x_vector_intrinsics__fzvector__march_z13=yes +fi + + if test 
x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch " >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch ... " >&6; } +if ${pgac_cv_s390x_vector_intrinsics__mzarch_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mzarch " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+ # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__mzarch_=yes +else + pgac_cv_s390x_vector_intrinsics__mzarch_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__mzarch_" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__mzarch_" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__mzarch_" = x"yes"; then + CFLAGS_CRC="-mzarch " + pgac_s390x_vector_intrinsics__mzarch_=yes +fi + + if test x"$pgac_s390x_vector_intrinsics__mzarch_" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch -march=z13" >&5 +$as_echo_n "checking for __attribute__((vector_size(16))), vec_gfmsum_accum_128 with CFLAGS=-mzarch -march=z13... " >&6; } +if ${pgac_cv_s390x_vector_intrinsics__mzarch__march_z13+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mzarch -march=z13" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <vecintrin.h> + #ifdef __clang__ + # if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) + # error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+ # endif + #endif + #ifdef __GNUC__ + # if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) + # error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. + # endif + #endif + unsigned long long a __attribute__((vector_size(16))) = { 0 }; + unsigned long long b __attribute__((vector_size(16))) = { 0 }; + unsigned char c __attribute__((vector_size(16))) = { 0 }; + static void vecint_gfmsum_accum_test(void){ + c = vec_gfmsum_accum_128(a, b, c); + } +int +main () +{ + + vecint_gfmsum_accum_test(); + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_s390x_vector_intrinsics__mzarch__march_z13=yes +else + pgac_cv_s390x_vector_intrinsics__mzarch__march_z13=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_s390x_vector_intrinsics__mzarch__march_z13" >&5 +$as_echo "$pgac_cv_s390x_vector_intrinsics__mzarch__march_z13" >&6; } +if test x"$pgac_cv_s390x_vector_intrinsics__mzarch__march_z13" = x"yes"; then + CFLAGS_CRC="-mzarch -march=z13" + pgac_s390x_vector_intrinsics__mzarch__march_z13=yes +fi + + fi + fi + fi +fi + # Select CRC-32C implementation. @@ -18140,9 +18360,21 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then USE_LOONGARCH_CRC32C=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 + # Use S390X vector extension. + if (test x"$pgac_s390x_vector_intrinsics__fzvector_" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch_" = x"yes"); then + USE_S390X_CRC32C=1 + else + # Use S390X vector extension with runtime check. 
+ if test x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch__march_z13" = x"yes"; then + USE_S390X_CRC32C_WITH_RUNTIME_CHECK=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -18193,12 +18425,30 @@ $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 $as_echo "LoongArch CRCC instructions" >&6; } else + if test x"$USE_S390X_CRC32C" = x"1"; then + +$as_echo "#define USE_S390X_CRC32C 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: S390X Vector instructions for CRC" >&5 +$as_echo "S390X Vector instructions for CRC" >&6; } + else + if test x"$USE_S390X_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + +$as_echo "#define USE_S390X_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_s390x_choose.o pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: S390X Vector instructions for CRC with runtime check" >&5 +$as_echo "S390X Vector instructions for CRC with runtime check" >&6; } + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } + fi + fi fi fi fi diff --git a/configure.ac b/configure.ac index 4b8335dc613..f431965c21c 100644 --- a/configure.ac +++ b/configure.ac @@ -2139,6 +2139,23 @@ fi # with the default compiler flags. PGAC_LOONGARCH_CRC32C_INTRINSICS() +# Check for S390X Vector intrinsics to do CRC calculations. 
+# +# First check for the host cpu +if test x"$host_cpu" = x"s390x" && test x"$ac_cv_func_getauxval" = x"yes"; then + # Check for all possible cflag combinations + PGAC_S390X_VECTOR_VX_INTRINSICS([-fzvector], []) + if test x"$pgac_s390x_vector_intrinsics__fzvector_" != x"yes"; then + PGAC_S390X_VECTOR_VX_INTRINSICS([-fzvector], [-march=z13]) + if test x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" != x"yes"; then + PGAC_S390X_VECTOR_VX_INTRINSICS([-mzarch], []) + if test x"$pgac_s390x_vector_intrinsics__mzarch_" != x"yes"; then + PGAC_S390X_VECTOR_VX_INTRINSICS([-mzarch], [-march=z13]) + fi + fi + fi +fi + AC_SUBST(CFLAGS_CRC) # Select CRC-32C implementation. @@ -2191,9 +2208,21 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then USE_LOONGARCH_CRC32C=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 + # Use S390X vector extension. + if (test x"$pgac_s390x_vector_intrinsics__fzvector_" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch_" = x"yes"); then + USE_S390X_CRC32C=1 + else + # Use S390X vector extension with runtime check. + if test x"$pgac_s390x_vector_intrinsics__fzvector__march_z13" = x"yes" || + test x"$pgac_s390x_vector_intrinsics__mzarch__march_z13" = x"yes"; then + USE_S390X_CRC32C_WITH_RUNTIME_CHECK=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. 
+ USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -2228,9 +2257,21 @@ else PG_CRC32C_OBJS="pg_crc32c_loongarch.o" AC_MSG_RESULT(LoongArch CRCC instructions) else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) + if test x"$USE_S390X_CRC32C" = x"1"; then + AC_DEFINE(USE_S390X_CRC32C, 1, [Define to 1 to use S390X Vector instructions for CRC.]) + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_sb8.o" + AC_MSG_RESULT(S390X Vector instructions for CRC) + else + if test x"$USE_S390X_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + AC_DEFINE(USE_S390X_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use S390X Vector instructions with runtime check for CRC.]) + PG_CRC32C_OBJS="pg_crc32c_s390x.o pg_crc32c_s390x_choose.o pg_crc32c_sb8.o" + AC_MSG_RESULT(S390X Vector instructions for CRC with runtime check) + else + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) + fi + fi fi fi fi diff --git a/meson.build b/meson.build index d142e3e408b..3177c6e1b78 100644 --- a/meson.build +++ b/meson.build @@ -2547,6 +2547,49 @@ int main(void) have_optimized_crc = true endif +elif host_cpu == 's390x' and cc.has_function('getauxval') + if cc.get_id() == 'clang' + VGFMAFLAG='-fzvector' + else + VGFMAFLAG='-mzarch' + endif + + prog = ''' +#ifdef __clang__ +# if ((__clang_major__ == 18) || (__clang_major__ == 19 && (__clang_minor__ < 1 || (__clang_minor__ == 1 && __clang_patchlevel__ < 2)))) +# error CRC32-VX optimizations are broken due to compiler bug in Clang versions: 18.0.0 <= clang_version < 19.1.2. Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. 
+# endif +#endif +#ifdef __GNUC__ +# if ((__GNUC__ == 14) && (__GNUC_MINOR__ < 3)) +# error CRC32-VX optimizations are broken due to compiler bug in GCC versions: 14.0.0 <= gcc_version < 14.3.0 Either disable the CRC32-VX optimization, or switch to another compiler/compiler version. +# endif +#endif +#include <vecintrin.h> +unsigned long long a __attribute__((vector_size(16))) = { 0 }; +unsigned long long b __attribute__((vector_size(16))) = { 0 }; +unsigned char c __attribute__((vector_size(16))) = { 0 }; + +int main(void) { + // test for vector extension + // we can safely assume that if we have vec_gfmsum_accum_128 + // we will have all needed builtins + c = vec_gfmsum_accum_128(a, b, c); + return c[0]; +}''' + if cc.links(prog, name: 's390x_vector_vx', args: test_c_args + [VGFMAFLAG]) + # Use S390X CRC Extension unconditionally + cdata.set('USE_S390X_CRC32C',1) + cflags_crc += VGFMAFLAG + have_optimized_crc = true + elif cc.links(prog, name: 's390x_vector_vx+march=z13', args: test_c_args + [VGFMAFLAG, '-march=z13']) + # Use S390X CRC Extension, with runtime check + cdata.set('USE_S390X_CRC32C',false) + cdata.set('USE_S390X_CRC32C_WITH_RUNTIME_CHECK',1) + cflags_crc += VGFMAFLAG + cflags_crc += '-march=z13' + have_optimized_crc = true + endif endif if not have_optimized_crc diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 726a7c1be1f..a983f6d64d3 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -666,6 +666,12 @@ /* Define to 1 to use ARMv8 CRC Extension with a runtime check. */ #undef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use S390X CRC Extension. */ +#undef USE_S390X_CRC32C + +/* Define to 1 to use S390X CRC Extension with a runtime check. */ +#undef USE_S390X_CRC32C_WITH_RUNTIME_CHECK + /* Define to 1 to build with assertion checks. 
(--enable-cassert) */ #undef USE_ASSERT_CHECKING diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 82313bb7fcf..a1b11d07cd9 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -142,6 +142,31 @@ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len) extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#elif defined(USE_S390X_CRC32C) + +/* + * Use S390X vector instructions only for buffers of at least 16 bytes + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = (len) < 16 ? pg_comp_crc32c_sb8((crc),(data),(len)) : pg_comp_crc32c_s390x((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) = pg_bswap32(crc) ^ 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_s390x(pg_crc32c crc, const void *data, size_t len); + +#elif defined(USE_S390X_CRC32C_WITH_RUNTIME_CHECK) +/* + * Use S390X vector instructions only for buffers of at least 16 bytes. + * For runtime check use pg_comp_crc32c instead of pg_comp_crc32c_s390x + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = (len) < 16 ? pg_comp_crc32c_sb8((crc),(data),(len)) : pg_comp_crc32c((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) = pg_bswap32(crc) ^ 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_s390x(pg_crc32c crc, const void *data, size_t len); + #else /* * Use slicing-by-8 algorithm. 
diff --git a/src/port/Makefile b/src/port/Makefile index 4274949dfa4..fc2c2907b36 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -85,10 +85,13 @@ libpgport.a: $(OBJS) rm -f $@ $(AR) $(AROPT) $@ $^ -# all versions of pg_crc32c_armv8.o need CFLAGS_CRC +# all versions of pg_crc32c_armv8.o and pg_crc32c_s390x.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_s390x.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_s390x_shlib.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_s390x_srv.o: CFLAGS+=$(CFLAGS_CRC) # # Shared library versions of object files diff --git a/src/port/meson.build b/src/port/meson.build index fc7b059fee5..d8284a31f47 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -100,6 +100,13 @@ replace_funcs_pos = [ # loongarch ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], + # s390x + ['pg_crc32c_s390x', 'USE_S390X_CRC32C','crc'], + ['pg_crc32c_s390x', 'USE_S390X_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_s390x_choose', 'USE_S390X_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_sb8', 'USE_S390X_CRC32C'], + ['pg_crc32c_sb8', 'USE_S390X_CRC32C_WITH_RUNTIME_CHECK'], + # generic fallback ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] diff --git a/src/port/pg_crc32c_s390x.c b/src/port/pg_crc32c_s390x.c new file mode 100644 index 00000000000..4fc839c5508 --- /dev/null +++ b/src/port/pg_crc32c_s390x.c @@ -0,0 +1,263 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_s390x.c + * Hardware-accelerated CRC-32C variants for Linux on IBM Z & LinuxONE + * + * This code was originally written by Hendrik Brueckner + * <brueck...@linux.ibm.com> for use in the Linux kernel and has been + * relicensed under the postgresql-license. + * + * Use the z/Architecture Vector Extension Facility to accelerate the + * computing of bitreflected CRC-32C checksums. 
+ * + * This CRC-32C implementation algorithm is bitreflected and processes + * the least-significant bit first (Little-Endian). + * + * Copyright (c) 2025, International Business Machines (IBM) + * + * IDENTIFICATION + * src/port/pg_crc32c_s390x.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include <vecintrin.h> + +#include "port/pg_crc32c.h" + +#define VX_ALIGNMENT 16L +#define VX_ALIGN_MASK (VX_ALIGNMENT - 1) + +typedef unsigned char uv16qi __attribute__((vector_size(16))); +typedef unsigned int uv4si __attribute__((vector_size(16))); +typedef unsigned long long uv2di __attribute__((vector_size(16))); + +static uint32_t +crc32_le_vgfm_16(uint32_t crc, const unsigned char *buf, size_t len) +{ + /*---------- + * The CRC-32C constant block contains reduction constants to fold and + * process particular chunks of the input data stream in parallel. + * + * For the CRC-32C variants, the constants are precomputed according to + * these definitions: + * + * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 + * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 + * R3 = [(x128+32 mod P'(x) << 32)]' << 1 + * R4 = [(x128-32 mod P'(x) << 32)]' << 1 + * R5 = [(x64 mod P'(x) << 32)]' << 1 + * R6 = [(x32 mod P'(x) << 32)]' << 1 + * + * The bitreflected Barret reduction constant, u', is defined as + * the bit reversal of floor(x**64 / P(x)). + * + * where P(x) is the polynomial in the normal domain and the P'(x) is the + * polynomial in the reversed (bitreflected) domain. 
+ * + * CRC-32C (Castagnoli) polynomials: + * + * P(x) = 0x1EDC6F41 + * P'(x) = 0x82F63B78 + */ + const uv16qi perm_le2be = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* BE->LE mask */ + const uv2di r2r1 = {0x09e4addf8, 0x740eef02}; /* R2, R1 */ + const uv2di r4r3 = {0x14cd00bd6, 0xf20c0dfe}; /* R4, R3 */ + const uv2di r5 = {0, 0x0dd45aab8}; /* R5 */ + const uv2di ru_poly = {0, 0x0dea713f1}; /* u' */ + const uv2di crc_poly = {0, 0x105ec76f0}; /* P'(x) << 1 */ + uv2di v0 = {0, 0}; + uv2di v1 = {0, 0}; + uv2di v2 = {0, 0}; + uv2di v3 = {0, 0}; + uv2di v4 = {0, 0}; + uv16qi v9 = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + /* + * Load the initial CRC value. + * + * The CRC value is loaded into the rightmost word of the vector register + * and is later XORed with the LSB portion of the loaded input data. + */ + v0 = (uv2di) vec_insert(crc, (uv4si) v0, 3); + + if (len >= 64) + { + /* Load a 64-byte data chunk and XOR with CRC */ + v1 = vec_perm(((uv2di *) buf)[0], ((uv2di *) buf)[0], perm_le2be); + v2 = vec_perm(((uv2di *) buf)[1], ((uv2di *) buf)[1], perm_le2be); + v3 = vec_perm(((uv2di *) buf)[2], ((uv2di *) buf)[2], perm_le2be); + v4 = vec_perm(((uv2di *) buf)[3], ((uv2di *) buf)[3], perm_le2be); + + v1 ^= v0; + buf += 64; + len -= 64; + + while (len >= 64) + { + /* Load the next 64-byte data chunk */ + uv16qi part1 = vec_perm(((uv16qi *) buf)[0], ((uv16qi *) buf)[0], perm_le2be); + uv16qi part2 = vec_perm(((uv16qi *) buf)[1], ((uv16qi *) buf)[1], perm_le2be); + uv16qi part3 = vec_perm(((uv16qi *) buf)[2], ((uv16qi *) buf)[2], perm_le2be); + uv16qi part4 = vec_perm(((uv16qi *) buf)[3], ((uv16qi *) buf)[3], perm_le2be); + + /* + * Perform a GF(2) multiplication of the doublewords in V1 with + * the R1 and R2 reduction constants in V0. The intermediate + * result is then folded (accumulated) with the next data chunk in + * PART1 and stored in V1. Repeat this step for the register + * contents in V2, V3, and V4 respectively. 
+ */ + v1 = (uv2di) vec_gfmsum_accum_128(r2r1, v1, part1); + v2 = (uv2di) vec_gfmsum_accum_128(r2r1, v2, part2); + v3 = (uv2di) vec_gfmsum_accum_128(r2r1, v3, part3); + v4 = (uv2di) vec_gfmsum_accum_128(r2r1, v4, part4); + + buf += 64; + len -= 64; + } + + /* + * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with + * R3 and R4 and accumulating the next 128-bit chunk until a single + * 128-bit value remains. + */ + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v2); + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v3); + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v4); + } + else + { + + /* + * Load a 16-byte data chunk and XOR with CRC + */ + v1 = vec_perm(((uv2di *) buf)[0], ((uv2di *) buf)[0], perm_le2be); + v1 ^= v0; + buf += 16; + len -= 16; + } + + while (len >= 16) + { + /* Load next data chunk */ + v2 = vec_perm(*(uv2di *) buf, *(uv2di *) buf, perm_le2be); + + /* Fold next data chunk */ + v1 = (uv2di) vec_gfmsum_accum_128(r4r3, v1, (uv16qi) v2); + + buf += 16; + len -= 16; + } + + /* + * Set up a vector register for byte shifts. The shift value must be + * loaded in bits 1-4 in byte element 7 of a vector register. Shift by 8 + * bytes: 0x40 Shift by 4 bytes: 0x20 + */ + + v9 = vec_insert((unsigned char) 0x40, v9, 7); + + /* + * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes to + * move R4 into the rightmost doubleword and set the leftmost doubleword + * to 0x1. + */ + v0 = vec_srb(r4r3, (uv2di) v9); + v0[0] = 1; + + /* + * Compute GF(2) product of V1 and V0. The rightmost doubleword of V1 is + * multiplied with R4. The leftmost doubleword of V1 is multiplied by 0x1 + * and is then XORed with rightmost product. Implicitly, the intermediate + * leftmost product becomes padded + */ + v1 = (uv2di) vec_gfmsum_128(v0, v1); + + /* + * Now do the final 32-bit fold by multiplying the rightmost word in V1 + * with R5 and XOR the result with the remaining bits in V1. 
+ * + * To achieve this by a single VGFMAG, right shift V1 by a word and store + * the result in V2 which is then accumulated. Use the vector unpack + * instruction to load the rightmost half of the doubleword into the + * rightmost doubleword element of V1; the other half is loaded in the + * leftmost doubleword. The vector register with CONST_R5 contains the R5 + * constant in the rightmost doubleword and the leftmost doubleword is + * zero to ignore the leftmost product of V1. + */ + v9 = vec_insert((unsigned char) 0x20, v9, 7); + v2 = vec_srb(v1, (uv2di) v9); + v1 = vec_unpackl((uv4si) v1); /* Split rightmost doubleword */ + v1 = (uv2di) vec_gfmsum_accum_128(r5, v1, (uv16qi) v2); + + /*---------- + * Apply a Barret reduction to compute the final 32-bit CRC value. + * + * The input values to the Barret reduction are the degree-63 polynomial + * in V1 (R(x)), degree-32 generator polynomial, and the reduction + * constant u. The Barret reduction result is the CRC value of R(x) mod + * P(x). + * + * The Barret reduction algorithm is defined as: + * + * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + * 3. C(x) = R(x) XOR T2(x) mod x^32 + * + * Note: The leftmost doubleword of vector register containing + * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product + * is zero and does not contribute to the final result. + */ + + /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ + v2 = vec_unpackl((uv4si) v1); + v2 = (uv2di) vec_gfmsum_128(ru_poly, v2); + + /* + * Compute the GF(2) product of the CRC polynomial with T1(x) in V2 and + * XOR the intermediate result, T2(x), with the value in V1. The final + * result is stored in word element 2 of V2. 
+ */ + v2 = vec_unpackl((uv4si) v2); + v2 = (uv2di) vec_gfmsum_accum_128(crc_poly, v2, (uv16qi) v1); + + return ((uv4si) v2)[2]; +} + +pg_crc32c +pg_comp_crc32c_s390x(pg_crc32c crc, const void *data, size_t len) +{ + uintptr_t prealign, + aligned, + remaining; + const unsigned char *buf = data; + + /* + * Preprocess initial bytes with sb8 so the hw can start at an aligned + * address + */ + if ((uintptr_t) buf & VX_ALIGN_MASK) + { + prealign = VX_ALIGNMENT - ((uintptr_t) buf & VX_ALIGN_MASK); + len -= prealign; + crc = pg_comp_crc32c_sb8(crc, buf, prealign); + buf += prealign; + } + aligned = len & ~VX_ALIGN_MASK; + remaining = len & VX_ALIGN_MASK; + + /* Process major part of the data with hw acceleration */ + if (aligned) + { + crc = pg_bswap32(crc32_le_vgfm_16(pg_bswap32(crc), buf, (size_t) aligned)); + } + + /* Process remaining bytes that could not be handled by hw */ + if (remaining) + { + crc = pg_comp_crc32c_sb8(crc, buf + aligned, remaining); + } + + return crc; +} diff --git a/src/port/pg_crc32c_s390x_choose.c b/src/port/pg_crc32c_s390x_choose.c new file mode 100644 index 00000000000..124f0ce71c3 --- /dev/null +++ b/src/port/pg_crc32c_s390x_choose.c @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_s390x_choose.c + * Choose between S390X vectorized CRC-32C and software CRC-32C + * implementation. + * + * On first call, checks if the CPU we're running on supports the + * S390X_VX Extension. If it does, use the special instructions for + * CRC-32C computation. Otherwise, fall back to the pure software + * implementation (slicing-by-8). + * + * Copyright (c) 2025, International Business Machines (IBM) + * + * IDENTIFICATION + * src/port/pg_crc32c_s390x_choose.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" +#include "port/pg_crc32c.h" + +#include <sys/auxv.h> + +/* + * This gets called on the first call. 
It replaces the function pointer + * so that subsequent calls are routed directly to the chosen implementation. + */ +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +{ + /* default call sb8 */ + pg_comp_crc32c = pg_comp_crc32c_sb8; + + if (getauxval(AT_HWCAP) & HWCAP_S390_VX) + { + pg_comp_crc32c = pg_comp_crc32c_s390x; + } + + return pg_comp_crc32c(crc, data, len); +} + +pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; -- 2.49.0