From 6751e8a6114ce5ca9920c4e18ec2d2a48278bdde Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amonson@intel.com>
Date: Fri, 9 Aug 2024 08:00:09 -0700
Subject: [PATCH] [Feat] Add support for the SIMD AVX-512 crc32c algorithm.

Signed-off-by: Paul Amonson <paul.d.amonson@intel.com>
---
 config/c-compiler.m4                |  48 ++++++
 configure                           | 213 ++++++++++++++++++++-----
 configure.ac                        | 106 +++++++-----
 meson.build                         |  40 ++++-
 src/include/pg_config.h.in          |   3 +
 src/include/port/pg_crc32c.h        |  23 +++
 src/include/port/pg_hw_feat_check.h |   9 +-
 src/port/Makefile                   |   5 +
 src/port/meson.build                |   4 +
 src/port/pg_crc32c_avx512.c         | 239 ++++++++++++++++++++++++++++
 src/port/pg_crc32c_avx512_choose.c  |  42 +++++
 src/port/pg_hw_feat_check.c         |  71 ++++++++-
 12 files changed, 717 insertions(+), 86 deletions(-)
 create mode 100644 src/port/pg_crc32c_avx512.c
 create mode 100644 src/port/pg_crc32c_avx512_choose.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 10f8c7bd0a..1d33932cb5 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -628,6 +628,54 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
 
+# PGAC_AVX512_CRC32_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the x86 CRC instructions added in AVX-512,
+# using the intrinsic functions:
+
+# (We don't test the 8-byte variant, _mm_crc32_u64, but it is assumed to
+# be present if the other ones are, on x86-64 platforms)
+#
+# An optional compiler flag can be passed as arguments (e.g. -msse4.2
+# -mavx512vl -mvpclmulqdq). If the intrinsics are supported, sets
+# pgac_avx512_crc32_intrinsics, and CFLAGS_CRC.
+AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [const unsigned long k1k2[[8]] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[[512]];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_CRC="$1"
+  pgac_avx512_crc32_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_CRC32_INTRINSICS
+
 
 # PGAC_ARMV8_CRC32C_INTRINSICS
 # ----------------------------
diff --git a/configure b/configure
index 5be6fb4d5f..fca02db11d 100755
--- a/configure
+++ b/configure
@@ -17767,6 +17767,123 @@ fi
 
 fi
 
+# Check for Intel AVX-512 intrinsics to do CRC calculations.
+#
+# First check if the _mm512_clmulepi64_epi128 and more intrinsics can
+# be used with the default compiler flags. If not, check if adding
+# the -msse4.2, -mavx512vl and -mvpclmulqdqif flag helps. CFLAGS_CRC
+# is set to -msse4.2, -mavx512vl and -mvpclmulqdqif that's required.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_crc32_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const unsigned long k1k2[8] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[512];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_crc32_intrinsics_=yes
+else
+  pgac_cv_avx512_crc32_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_crc32_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_crc32_intrinsics_" = x"yes"; then
+  CFLAGS_CRC=""
+  pgac_avx512_crc32_intrinsics=yes
+fi
+
+if test x"$pgac_avx512_crc32_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=-msse4.2 -mavx512vl -mvpclmulqdq" >&5
+$as_echo_n "checking for _mm512_clmulepi64_epi128, _mm512_clmulepi64_epi128... with CFLAGS=-msse4.2 -mavx512vl -mvpclmulqdq... " >&6; }
+if ${pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -msse4.2 -mavx512vl -mvpclmulqdq"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const unsigned long k1k2[8] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[512];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq=yes
+else
+  pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" >&5
+$as_echo "$pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" >&6; }
+if test x"$pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" = x"yes"; then
+  CFLAGS_CRC="-msse4.2 -mavx512vl -mvpclmulqdq"
+  pgac_avx512_crc32_intrinsics=yes
+fi
+
+fi
+
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -17939,31 +18056,42 @@ fi
 #
 # If we are targeting a LoongArch processor, CRC instructions are
 # always available (at least on 64 bit), so no runtime check is needed.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
-  # Use Intel SSE 4.2 if available.
-  if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_CRC32C=1
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
+  # Use Intel AVX 512 if available.
+  if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && test x"$AVX512_TARGETED" = x"1" ; then
+    USE_AVX512_CRC32C=1
   else
-    # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
-    # the runtime check.
-    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
+   # Use Intel SSE 4.2 if available.
+    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+      USE_SSE42_CRC32C=1
     else
-      # Use ARM CRC Extension if available.
-      if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
-        USE_ARMV8_CRC32C=1
+      # Intel AVX 512, with runtime check? The CPUID instruction is needed for
+      # the runtime check.
+      if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+          USE_AVX512_CRC32C_WITH_RUNTIME_CHECK=1
       else
-        # ARM CRC Extension, with runtime check?
-        if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
-          USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+        # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
+        # the runtime check.
+        if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+          USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # LoongArch CRCC instructions.
-          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
-            USE_LOONGARCH_CRC32C=1
+          # Use ARM CRC Extension if available.
+          if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
+            USE_ARMV8_CRC32C=1
           else
-            # fall back to slicing-by-8 algorithm, which doesn't require any
-            # special CPU support.
-            USE_SLICING_BY_8_CRC32C=1
+            # ARM CRC Extension, with runtime check?
+            if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
+              USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+            else
+              # LoongArch CRCC instructions.
+              if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+                USE_LOONGARCH_CRC32C=1
+              else
+                # fall back to slicing-by-8 algorithm, which doesn't require any
+                # special CPU support.
+                USE_SLICING_BY_8_CRC32C=1
+              fi
+            fi
           fi
         fi
       fi
@@ -17982,44 +18110,53 @@ $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
-  if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+  if test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+
+$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+    PG_CRC32C_OBJS="pg_crc32c_avx512.o pg_crc32c_sb8.o pg_crc32c_avx512_choose.o"
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX 512 with runtime check" >&5
+$as_echo "AVX 512 with runtime check" >&6; }
+  else
+    if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
 
 $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
+      PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
-  else
-    if test x"$USE_ARMV8_CRC32C" = x"1"; then
+    else
+      if test x"$USE_ARMV8_CRC32C" = x"1"; then
 
 $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
 
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
-      { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
 $as_echo "ARMv8 CRC instructions" >&6; }
-    else
-      if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+      else
+        if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
 
 $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
+          PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
-      else
-        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+        else
+          if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
 
 $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
 
-          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
-          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+            PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
 $as_echo "LoongArch CRCC instructions" >&6; }
-        else
+          else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+            PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+          fi
         fi
       fi
     fi
diff --git a/configure.ac b/configure.ac
index 2e64f53898..ce68dce9d2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2107,6 +2107,17 @@ if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
   PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
 fi
 
+# Check for Intel AVX-512 intrinsics to do CRC calculations.
+#
+# First check if the _mm512_clmulepi64_epi128 and more intrinsics can
+# be used with the default compiler flags. If not, check if adding
+# the -msse4.2, -mavx512vl and -mvpclmulqdqif flag helps. CFLAGS_CRC
+# is set to -msse4.2, -mavx512vl and -mvpclmulqdqif that's required.
+PGAC_AVX512_CRC32_INTRINSICS([])
+if test x"$pgac_avx512_crc32_intrinsics" != x"yes"; then
+  PGAC_AVX512_CRC32_INTRINSICS([-msse4.2 -mavx512vl -mvpclmulqdq])
+fi
+
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [
@@ -2152,31 +2163,42 @@ AC_SUBST(CFLAGS_CRC)
 #
 # If we are targeting a LoongArch processor, CRC instructions are
 # always available (at least on 64 bit), so no runtime check is needed.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
-  # Use Intel SSE 4.2 if available.
-  if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_CRC32C=1
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
+  # Use Intel AVX 512 if available.
+  if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && test x"$AVX512_TARGETED" = x"1" ; then
+    USE_AVX512_CRC32C=1
   else
-    # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
-    # the runtime check.
-    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
+   # Use Intel SSE 4.2 if available.
+    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+      USE_SSE42_CRC32C=1
     else
-      # Use ARM CRC Extension if available.
-      if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
-        USE_ARMV8_CRC32C=1
+      # Intel AVX 512, with runtime check? The CPUID instruction is needed for
+      # the runtime check.
+      if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+          USE_AVX512_CRC32C_WITH_RUNTIME_CHECK=1
       else
-        # ARM CRC Extension, with runtime check?
-        if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
-          USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+        # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
+        # the runtime check.
+        if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+          USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # LoongArch CRCC instructions.
-          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
-            USE_LOONGARCH_CRC32C=1
+          # Use ARM CRC Extension if available.
+          if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
+            USE_ARMV8_CRC32C=1
           else
-            # fall back to slicing-by-8 algorithm, which doesn't require any
-            # special CPU support.
-            USE_SLICING_BY_8_CRC32C=1
+            # ARM CRC Extension, with runtime check?
+            if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
+              USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+            else
+              # LoongArch CRCC instructions.
+              if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+                USE_LOONGARCH_CRC32C=1
+              else
+                # fall back to slicing-by-8 algorithm, which doesn't require any
+                # special CPU support.
+                USE_SLICING_BY_8_CRC32C=1
+              fi
+            fi
           fi
         fi
       fi
@@ -2191,29 +2213,35 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
   PG_CRC32C_OBJS="pg_crc32c_sse42.o"
   AC_MSG_RESULT(SSE 4.2)
 else
-  if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
-    AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
-    AC_MSG_RESULT(SSE 4.2 with runtime check)
+  if test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+    AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel AVX 512 CRC instructions with a runtime check.])
+    PG_CRC32C_OBJS="pg_crc32c_avx512.o pg_crc32c_sb8.o pg_crc32c_avx512_choose.o"
+    AC_MSG_RESULT(AVX 512 with runtime check)
   else
-    if test x"$USE_ARMV8_CRC32C" = x"1"; then
-      AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
-      AC_MSG_RESULT(ARMv8 CRC instructions)
+    if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+      AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
+      PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+      AC_MSG_RESULT(SSE 4.2 with runtime check)
     else
-      if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
-        AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.])
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
-        AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
+      if test x"$USE_ARMV8_CRC32C" = x"1"; then
+        AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+        AC_MSG_RESULT(ARMv8 CRC instructions)
       else
-        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
-          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
-          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
-          AC_MSG_RESULT(LoongArch CRCC instructions)
+        if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+          AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.])
+          PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+          AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
         else
-          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-          AC_MSG_RESULT(slicing-by-8)
+          if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+            AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+            PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+            AC_MSG_RESULT(LoongArch CRCC instructions)
+          else
+            AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+            PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+            AC_MSG_RESULT(slicing-by-8)
+          fi
         fi
       fi
     fi
diff --git a/meson.build b/meson.build
index cd711c6d01..1ddd1bed40 100644
--- a/meson.build
+++ b/meson.build
@@ -2245,6 +2245,34 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
     cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   else
+    avx_prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+  const unsigned long k1k2[8] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[512];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;
+}
+'''
 
     prog = '''
 #include <nmmintrin.h>
@@ -2259,12 +2287,20 @@ int main(void)
 }
 '''
 
-    if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+    if cc.links(avx_prog,
+          name: '_mm512_clmulepi64_epi128 ... with -msse4.2 -mavx512vl -mvpclmulqdq',
+          args: test_c_args + ['-msse4.2', '-mavx512vl', '-mvpclmulqdq'])
+      cflags_crc += ['-msse4.2','-mavx512vl','-mvpclmulqdq']
+      cdata.set('USE_AVX512_CRC32C', false)
+      cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
+      have_optimized_crc = true
+    endif
+    if have_optimized_crc == false and cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
           args: test_c_args)
       # Use Intel SSE 4.2 unconditionally.
       cdata.set('USE_SSE42_CRC32C', 1)
       have_optimized_crc = true
-    elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
+    elif have_optimized_crc == false and cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
           args: test_c_args + ['-msse4.2'])
       # Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
       # the runtime check.
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 979925cc2e..ea797f13f3 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -739,6 +739,9 @@
 /* Define to 1 use Intel SSE 4.2 CRC instructions. */
 #undef USE_SSE42_CRC32C
 
+/* Define to 1 to use Intel AVX 512 CRC instructions with a runtime check. */
+#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
 #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 63c8e3a00b..ade06dbcab 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -49,6 +49,14 @@ typedef uint32 pg_crc32c;
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined (USE_AVX512_CRC32)
+/* Use Intel AVX512 instructions. */
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c_avx512((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
@@ -67,6 +75,21 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK)
+
+/*
+ * Use Intel AVX-512 instructions, but perform a runtime check first to check that
+ * they are available.
+ */
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c (*pg_comp_crc32c)(pg_crc32c crc, const void *data, size_t len);
+
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/include/port/pg_hw_feat_check.h b/src/include/port/pg_hw_feat_check.h
index 58be900b54..21ee8615e1 100644
--- a/src/include/port/pg_hw_feat_check.h
+++ b/src/include/port/pg_hw_feat_check.h
@@ -30,4 +30,11 @@ extern PGDLLIMPORT bool pg_popcount_available(void);
  * available.
  */
 extern PGDLLIMPORT bool pg_popcount_avx512_available(void);
-#endif							/* PG_HW_FEAT_CHECK_H */
+
+/*
+ * Test to see if all hardware features required by the AVX-512 SIMD
+ * algorithm are available.
+ */
+extern bool pg_crc32c_avx512_available(void);
+
+#endif						/* PG_HW_FEAT_CHECK_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index b18710eeef..35445d88f1 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -89,6 +89,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_crc32c_avx512.o need CFLAGS_CRC
+pg_crc32c_avx512.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_avx512_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_avx512_srv.o: CFLAGS+=$(CFLAGS_CRC)
+
 # all versions of pg_crc32c_armv8.o need CFLAGS_CRC
 pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index f8cafc4bd4..31d50a7a3b 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -84,6 +84,10 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_crc32c_avx512', 'USE_AVX512_CRC32C'],
+  ['pg_crc32c_avx512', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_avx512_choose', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_crc32c_sb8', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
   ['pg_hw_feat_check', 'HAVE_XSAVE_INTRINSICS', 'xsave'],
diff --git a/src/port/pg_crc32c_avx512.c b/src/port/pg_crc32c_avx512.c
new file mode 100644
index 0000000000..be42a34a73
--- /dev/null
+++ b/src/port/pg_crc32c_avx512.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_avx512.c
+ *	  Compute CRC-32C checksum using Intel AVX-512 instructions.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include <immintrin.h>
+
+#include "port/pg_crc32c.h"
+
+/*
+ * Process eight bytes of data at a time.
+ *
+ * NB: We do unaligned accesses here. The Intel architecture allows that,
+ * and performance testing didn't show any performance gain from aligning
+ * the begin address.
+ */
+pg_attribute_no_sanitize_alignment()
+inline static pg_crc32c
+crc32c_fallback(pg_crc32c crc, const uint8 *p, size_t length)
+{
+	const unsigned char *pend = p + length;
+
+	/*
+	 * Process eight bytes of data at a time.
+	 *
+	 * NB: We do unaligned accesses here. The Intel architecture allows that,
+	 * and performance testing didn't show any performance gain from aligning
+	 * the begin address.
+	 */
+	while (p + 8 <= pend)
+	{
+		crc = (uint32)_mm_crc32_u64(crc, *((const uint64 *)p));
+		p += 8;
+	}
+
+	/* Process remaining full four bytes if any */
+	if (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *)p));
+		p += 4;
+	}
+
+	/* Process any remaining bytes one at a time. */
+	while (p < pend)
+	{
+		crc = _mm_crc32_u8(crc, *p);
+		p++;
+	}
+
+	return crc;
+}
+
+/*******************************************************************
+ * pg_crc32c_avx512(): compute the crc32c of the buffer, where the
+ * buffer length must be at least 256, and a multiple of 64. Based
+ * on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
+ * Instruction"
+ *  V. Gopal, E. Ozturk, et al., 2009
+ *
+ * For This Function:
+ * Copyright 2015 The Chromium Authors
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *    * Neither the name of Google LLC nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+pg_attribute_no_sanitize_alignment()
+inline pg_crc32c
+pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length)
+{
+	static const uint64 k1k2[8] = {
+		0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4,
+		0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+	static const uint64 k3k4[8] = {
+		0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02,
+		0x9e4addf8, 0x740eef02, 0x9e4addf8};
+	static const uint64 k9k10[8] = {
+		0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2,
+		0x0d3b6092, 0x6992cea2, 0x0d3b6092};
+	static const uint64 k1k4[8] = {
+		0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe,
+		0x493c7d27, 0x00000000, 0x00000000};
+
+	const uint8 *input = (const uint8 *)data;
+	if (length >= 256)
+	{
+		uint64 val;
+		__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+		__m128i a1, a2;
+
+		/*
+		 * AVX-512 Optimized crc32c algorithm with mimimum of 256 bytes aligned
+		 * to 32 bytes.
+		 * >>> BEGIN
+		 */
+
+		/*
+		* There's at least one block of 256.
+		*/
+		x1 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+		x2 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+		x3 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+		x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+		x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+
+		x0 = _mm512_load_si512((__m512i *)k1k2);
+
+		input += 256;
+		length -= 256;
+
+		/*
+		* Parallel fold blocks of 256, if any.
+		*/
+		while (length >= 256)
+		{
+			x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+			x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+			x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+			x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
+
+			x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+			x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+			x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+			x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
+
+			y5 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+			y6 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+			y7 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+			y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+			x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+			x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96);
+			x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96);
+			x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96);
+
+			input += 256;
+			length -= 256;
+				}
+
+		/*
+		 * Fold 256 bytes into 64 bytes.
+		 */
+		x0 = _mm512_load_si512((__m512i *)k9k10);
+		x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+		x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+		x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96);
+
+		x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+		x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+		x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96);
+
+		x0 = _mm512_load_si512((__m512i *)k3k4);
+		y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+		y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+		x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96);
+
+		/*
+		 * Single fold blocks of 64, if any.
+		 */
+		while (length >= 64)
+		{
+			x2 = _mm512_loadu_si512((__m512i *)input);
+
+			x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+			x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+			x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96);
+
+			input += 64;
+			length -= 64;
+		}
+
+		/*
+		 * Fold 512-bits to 128-bits.
+		 */
+		x0 = _mm512_loadu_si512((__m512i *)k1k4);
+
+		a2 = _mm512_extracti32x4_epi32(x1, 3);
+		x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+		x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+		x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96);
+
+		x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+		x0 = _mm512_xor_epi64(x1, x0);
+		a1 = _mm512_extracti32x4_epi32(x0, 1);
+		a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+
+		/*
+		 * Fold 128-bits to 32-bits.
+		 */
+		val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+		crc = (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+		/*
+		 * AVX-512 Optimized crc32c algorithm with mimimum of 256 bytes aligned
+		 * to 32 bytes.
+		 * <<< END
+		 ******************************************************************/
+	}
+
+	/*
+	 * Finish any remaining bytes.
+	 */
+	return crc32c_fallback(crc, input, length);
+}
diff --git a/src/port/pg_crc32c_avx512_choose.c b/src/port/pg_crc32c_avx512_choose.c
new file mode 100644
index 0000000000..4f11c278be
--- /dev/null
+++ b/src/port/pg_crc32c_avx512_choose.c
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_avx512_choose.c
+ *	  Choose between Intel AVX-512 and software CRC-32C implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel AVX-
+ * 512. If it does, use the special AVX-512 instructions for CRC-32C
+ * computation. Otherwise, fall back to the pure software implementation
+ * (slicing-by-8).
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+#include "port/pg_hw_feat_check.h"
+
+
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ */
+static pg_crc32c
+pg_comp_avx512_choose(pg_crc32c crc, const void *data, size_t len)
+{
+	if (pg_crc32c_avx512_available())
+		pg_comp_crc32c = pg_comp_crc32c_avx512;
+	else
+		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_avx512_choose;
diff --git a/src/port/pg_hw_feat_check.c b/src/port/pg_hw_feat_check.c
index 455005add5..35d6f9cdb1 100644
--- a/src/port/pg_hw_feat_check.c
+++ b/src/port/pg_hw_feat_check.c
@@ -132,9 +132,60 @@ bool PGDLLIMPORT pg_popcount_available(void)
 	return is_bit_set_in_exx(exx, ECX, 23);
  }
 
+/*
+ * Check for CPU supprt for CPUIDEX: avx512-f
+ */
+inline static bool
+avx512f_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set_in_exx(exx, EBX, 16); /* avx512-f */
+}
+
+/*
+ * Check for CPU supprt for CPUIDEX: vpclmulqdq
+ */
+inline static bool
+vpclmulqdq_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set_in_exx(exx, ECX, 10); /* vpclmulqdq */
+}
+
+/*
+ * Check for CPU supprt for CPUIDEX: vpclmulqdq
+ */
+inline static bool
+avx512vl_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set_in_exx(exx, EBX, 31); /* avx512-vl */
+}
+
+/*
+ * Check for CPU supprt for CPUID: sse4.2
+ */
+inline static bool
+sse42_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuid(1, exx);
+	return is_bit_set_in_exx(exx, ECX, 20); /* sse4.2 */
+}
+
+/****************************************************************************/
+/*                               Public API                                 */
+/****************************************************************************/
  /*
-  * Returns true if the CPU supports the instructions required for the AVX-512
-  * pg_popcount() implementation.
+  * Returns true if the CPU supports the instructions required for the
+  * AVX-512 pg_popcount() implementation.
   *
   * PA: The call to 'osxsave_available' MUST preceed the call to
   *     'zmm_regs_available' function per NB above.
@@ -151,9 +202,17 @@ bool PGDLLIMPORT pg_popcount_avx512_available(void)
  */
 bool PGDLLIMPORT pg_crc32c_sse42_available(void)
 {
-	exx_t exx[4] = {0, 0, 0, 0};
-
-	pg_getcpuid(1, exx);
+	return sse42_available();
+}
 
-	return is_bit_set_in_exx(exx, ECX, 20);
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_crc32c implementation.
+ */
+inline bool
+pg_crc32c_avx512_available(void)
+{
+	return sse42_available() && osxsave_available() &&
+		   avx512f_available() && vpclmulqdq_available() &&
+		   avx512vl_available() && zmm_regs_available();
 }
-- 
2.34.1

