On Mon, Mar 24, 2025 at 6:37 PM John Naylor <johncnaylo...@gmail.com> wrote:
> I'll take a look at the configure checks soon, since I had some questions there.
I'm leaning towards a length limit for v15-0001 so that inlined instructions are likely to be unrolled. Aside from the lack of a commit message, I think that one is ready for commit soon-ish.

I'm feeling pretty good about 0002, but since there is still room for cosmetic fiddling, I want to let it sit for a bit longer. I felt the previous proposals for configure.ac were unnecessarily invasive, and the message looked out of place, so I made configure.ac more similar to master, using the AVX popcount stuff as a model. I also went the extra step and added a separate AC_MSG_CHECKING for vectorized CRC. I'm not sure we really need that, but this algorithm is trivially adaptable to Arm, so it might be welcome for visibility. For Meson, I just made the CRC checking comment a bit more general, since keeping up this level of detail would result in a loss of readability.

0003 is just to demonstrate on CI that we are in fact computing the same answer as master. An earlier patch had some additional tests in strings.sql, but I have yet to dig those out.

--
John Naylor
Amazon Web Services
From cebf6a4b6ecec7fdc30678828ad9883149d9378b Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Fri, 28 Feb 2025 16:27:30 +0700
Subject: [PATCH v15 1/3] Inline CRC computation for fixed-length input

Use a simplified copy of the loop in pg_crc32c_sse42.c to avoid
moving code to a separate header.
---
 src/include/port/pg_crc32c.h | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 65ebeacf4b1..0ab7513f523 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -43,12 +43,42 @@ typedef uint32 pg_crc32c;
 
 #if defined(USE_SSE42_CRC32C)
 /* Use Intel SSE4.2 instructions. */
+
+#include <nmmintrin.h>
+
 #define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
+pg_attribute_no_sanitize_alignment()
+static inline
+pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+	if (__builtin_constant_p(len) && len < 32)
+	{
+		const unsigned char *p = data;
+
+		/*
+		 * For small constant inputs, inline the computation to avoid a
+		 * function call and allow the compiler to unroll loops.
+		 */
+#if SIZEOF_VOID_P >= 8
+		for (; len >= 8; p += 8, len -= 8)
+			crc = _mm_crc32_u64(crc, *(const uint64 *) p);
+#endif
+		for (; len >= 4; p += 4, len -= 4)
+			crc = _mm_crc32_u32(crc, *(const uint32 *) p);
+		for (; len > 0; --len)
+			crc = _mm_crc32_u8(crc, *p++);
+		return crc;
+	}
+	else
+		return pg_comp_crc32c_sse42(crc, data, len);
+}
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
-- 
2.48.1
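For context, the constant-length call sites this dispatch targets look roughly like the sketch below. The 20-byte length mirrors the WAL-header case described in 0002's commit message; the function name and header argument are made up for illustration.

#include "c.h"
#include "port/pg_crc32c.h"

/*
 * Hypothetical caller: because the length is a compile-time constant below
 * the 32-byte cutoff, COMP_CRC32C() resolves to the inlined loops above,
 * which the compiler can unroll into two 8-byte and one 4-byte crc32
 * instructions instead of an out-of-line call.
 */
static pg_crc32c
fixed_header_crc(const void *header)
{
	pg_crc32c	crc;

	INIT_CRC32C(crc);
	COMP_CRC32C(crc, header, 20);
	FIN_CRC32C(crc);

	return crc;
}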
From c519bd870fbb719fe147984b1a2aeff81316172c Mon Sep 17 00:00:00 2001
From: John Naylor <john.nay...@postgresql.org>
Date: Tue, 25 Mar 2025 15:19:16 +0700
Subject: [PATCH v15 3/3] Add debug for CI

XXX not for commit
---
 src/port/pg_crc32c_sse42.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index f392eb5b236..3b9c448486f 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -19,6 +19,8 @@
 
 #include "port/pg_crc32c.h"
 
+#define DEBUG_CRC				/* XXX not for commit, or at least comment out */
+
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
@@ -87,6 +89,9 @@ pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
 	pg_crc32c	crc0 = crc;
 	size_t		len = length;
 	const char *buf = data;
+#ifdef DEBUG_CRC
+	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+#endif
 
 	/* Align on cacheline boundary. WIP: The threshold needs testing. */
 	if (unlikely(len > 256))
@@ -139,7 +144,13 @@ pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
 		len = end - buf;
 	}
 
-	return pg_comp_crc32c_sse42(crc0, buf, len);
+	crc0 = pg_comp_crc32c_sse42(crc0, buf, len);
+
+#ifdef DEBUG_CRC
+	Assert(crc0 == pg_comp_crc32c_sse42(crc, data, orig_len));
+#endif
+
+	return crc0;
 }
 
 #endif
-- 
2.48.1
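Beyond the Assert above, a throwaway user-space harness in the same spirit could compare the two implementations directly over varying lengths and alignments. This is only a sketch, not part of the patch set: it assumes a build with USE_AVX512_CRC32C_WITH_RUNTIME_CHECK defined, linkage against src/port, and a CPU that supports the AVX-512 path.

#include "c.h"
#include "port/pg_crc32c.h"

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	unsigned char buf[8192];

	/* arbitrary but deterministic contents */
	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] = (unsigned char) (i * 131 + 17);

	for (int trial = 0; trial < 100000; trial++)
	{
		size_t		off = (size_t) rand() % 64;
		size_t		len = (size_t) rand() % (sizeof(buf) - off);

		/* both start from the same initial value; results must agree */
		pg_crc32c	a = pg_comp_crc32c_sse42(0xFFFFFFFF, buf + off, len);
		pg_crc32c	b = pg_comp_crc32c_pclmul(0xFFFFFFFF, buf + off, len);

		if (a != b)
		{
			fprintf(stderr, "mismatch: off=%zu len=%zu\n", off, len);
			return 1;
		}
	}

	printf("ok\n");
	return 0;
}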
From 1fc0fd60062446307bda6c60619344d8588c8125 Mon Sep 17 00:00:00 2001 From: John Naylor <john.nay...@postgresql.org> Date: Tue, 25 Mar 2025 19:22:32 +0700 Subject: [PATCH v15 2/3] Improve CRC32C performance on x86_64 The current SSE4.2 implementation of CRC32C relies on the native CRC32 instruction, which operates on 8 bytes at a time. We can get a substantial speedup on longer inputs by using carryless multiplication on SIMD registers, processing 64 bytes per loop iteration. The VPCLMULQDQ instruction on 512-bit registers has been available on Intel hardware since 2019 and AMD since 2022. There is an older variant for 128-bit registers, but at least on Zen 2 it performs worse than normal CRC instructions for short inputs. (Thanks to David Rowley for testing on that platform.) We must now do a runtime check, even for builds that target SSE 4.2. This doesn't matter in practice for WAL (arguably the most critical case), because with commit XXXYYYZZZ the final computation with the 20-byte WAL header is inlined. Compared with two function calls, testing showed equal or slightly faster performance in performing an indirect function call on several dozen bytes followed by inlined instructions on constant input of 20 bytes. The MIT-licensed implementation was generated with the "generate" program from https://github.com/corsix/fast-crc32/ Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" V. Gopal, E. Ozturk, et al., 2009 Author: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> Author: Paul Amonson <paul.d.amon...@intel.com> Reviewed-by: John Naylor <johncnaylo...@gmail.com> Reviewed-by: Nathan Bossart <nathandboss...@gmail.com> Reviewed-by: Andres Freund <and...@anarazel.de> (earlier version) Reviewed-by: Matthew Sterrett <matthewsterre...@gmail.com> (earlier version) Tested-by: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> Discussion: https://postgr.es/m/bl1pr11mb530401fa7e9b1ca432cf9dc3dc...@bl1pr11mb5304.namprd11.prod.outlook.com Discussion: https://postgr.es/m/ph8pr11mb82869ff741dfa4e9a029ff13fb...@ph8pr11mb8286.namprd11.prod.outlook.com --- config/c-compiler.m4 | 37 +++++++++++++++ configure | 72 +++++++++++++++++++++++++++-- configure.ac | 20 +++++++-- meson.build | 55 ++++++++++++++++++----- src/include/pg_config.h.in | 3 ++ src/include/port/pg_crc32c.h | 39 +++++++++++----- src/port/meson.build | 1 + src/port/pg_crc32c_sse42.c | 75 +++++++++++++++++++++++++++++++ src/port/pg_crc32c_sse42_choose.c | 75 ++++++++++++++++++++++++------- 9 files changed, 333 insertions(+), 44 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 3712e81e38c..52b65406f88 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -581,6 +581,43 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX512_CRC32_INTRINSICS +# --------------------------- +# Check if the compiler supports the carryless multiplication +# and AVX-512VL instructions used for computing CRC32C with +# 512-bit registers. AVX-512F is assumed to be supported. +# +# If the intrinsics are supported, sets pgac_avx512_crc32_intrinsics. 
+AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics])])dnl +AC_CACHE_CHECK([for _mm512_clmulepi64_epi128], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h> + __m512i x; + __m512i y; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("vpclmulqdq,avx512vl"))) + #endif + static int crc32_avx512_test(void) + { + __m128i z; + + y = _mm512_clmulepi64_epi128(x, y, 0); + z = _mm_ternarylogic_epi64( + _mm512_castsi512_si128(y), + _mm512_extracti32x4_epi32(y, 1), + _mm512_extracti32x4_epi32(y, 2), + 0x96); + return _mm_crc32_u64(0, _mm_extract_epi64(z, 0)); + }], + [return crc32_avx512_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_CRC32_INTRINSICS # PGAC_ARMV8_CRC32C_INTRINSICS # ---------------------------- diff --git a/configure b/configure index fac1e9a4e39..abbb151549c 100755 --- a/configure +++ b/configure @@ -17378,6 +17378,59 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi +# Check for intrinsics to do vectorized CRC calculations. +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128" >&5 +$as_echo_n "checking for _mm512_clmulepi64_epi128... " >&6; } +if ${pgac_cv_avx512_crc32_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> + __m512i x; + __m512i y; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("vpclmulqdq,avx512vl"))) + #endif + static int crc32_avx512_test(void) + { + __m128i z; + + y = _mm512_clmulepi64_epi128(x, y, 0); + z = _mm_ternarylogic_epi64( + _mm512_castsi512_si128(y), + _mm512_extracti32x4_epi32(y, 1), + _mm512_extracti32x4_epi32(y, 2), + 0x96); + return _mm_crc32_u64(0, _mm_extract_epi64(z, 0)); + } +int +main () +{ +return crc32_avx512_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_crc32_intrinsics=yes +else + pgac_cv_avx512_crc32_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics" >&5 +$as_echo "$pgac_cv_avx512_crc32_intrinsics" >&6; } +if test x"$pgac_cv_avx512_crc32_intrinsics" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi + +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 @@ -17622,9 +17675,8 @@ fi # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. 
And if @@ -17680,7 +17732,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else @@ -17729,6 +17781,18 @@ $as_echo "slicing-by-8" >&6; } fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking which vectorized CRC-32C implementation to use" >&5 +$as_echo_n "checking which vectorized CRC-32C implementation to use... " >&6; } +if test x"$pgac_avx512_crc32_intrinsics" = x"yes"; then + +$as_echo "#define USE_AVX512_CRC_WITH_RUNTIME_CHECK 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5 +$as_echo "AVX-512 with runtime check" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 +$as_echo "none" >&6; } +fi # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/configure.ac b/configure.ac index b6d02f5ecc7..a037b8cd566 100644 --- a/configure.ac +++ b/configure.ac @@ -2057,6 +2057,12 @@ if test x"$host_cpu" = x"x86_64"; then fi fi +# Check for intrinsics to do vectorized CRC calculations. +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX512_CRC32_INTRINSICS() +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # PGAC_SSE42_CRC32_INTRINSICS() @@ -2096,9 +2102,8 @@ AC_SUBST(CFLAGS_CRC) # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if @@ -2151,7 +2156,7 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then @@ -2184,6 +2189,13 @@ else fi AC_SUBST(PG_CRC32C_OBJS) +AC_MSG_CHECKING([which vectorized CRC-32C implementation to use]) +if test x"$pgac_avx512_crc32_intrinsics" = x"yes"; then + AC_DEFINE(USE_AVX512_CRC_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC instructions with a runtime check.]) + AC_MSG_RESULT(AVX-512 with runtime check) +else + AC_MSG_RESULT(none) +fi # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/meson.build b/meson.build index 7cf518a2765..38736bcf853 100644 --- a/meson.build +++ b/meson.build @@ -2288,17 +2288,22 @@ endif ############################################################### # Select CRC-32C implementation. # -# If we are targeting a processor that has Intel SSE 4.2 instructions, we can -# use the special CRC instructions for calculating CRC-32C. 
If we're not -# targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# There are three methods of calculating CRC, in order of increasing +# performance: # -# Similarly, if we are targeting an ARM processor that has the CRC -# instructions that are part of the ARMv8 CRC Extension, use them. And if -# we're not targeting such a processor, but can nevertheless produce code that -# uses the CRC instructions, compile both, and select at runtime. +# 1. The fallback using a lookup table, called slicing-by-8 +# 2. CRC-32C instructions on word-sized registers (e.g. Intel SSE 4.2 +# and ARMv8 CRC Extension) +# 3. Algorithms that have at their core carryless multiplication +# instructions (called PCLMUL or PMULL) on vector registers. +# +# If we can produce code (via function attributes or additional compiler +# flags) that uses #2 (and possibly #3), we compile all implementations +# and select which one to use at runtime, depending on what is supported +# by the processor we're running on. +# +# If we are targeting a processor that has #2, we can use that without +# runtime selection. # # Note that we do not use __attribute__((target("..."))) for the ARM CRC # instructions because until clang 16, using the ARM intrinsics still requires @@ -2347,6 +2352,36 @@ int main(void) have_optimized_crc = true endif + # Test for PCLMUL intrinsics on 512-bit registers + prog = ''' +#include <immintrin.h> +__m512i x; +__m512i y; + +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("avx512vl,vpclmulqdq"))) +#endif +int main(void) +{ + __m128i z; + + y = _mm512_clmulepi64_epi128(x, y, 0); + z = _mm_ternarylogic_epi64( + _mm512_castsi512_si128(y), + _mm512_extracti32x4_epi32(y, 1), + _mm512_extracti32x4_epi32(y, 2), + 0x96); + /* return computed value, to prevent the above being optimized away */ + return _mm_crc32_u64(0, _mm_extract_epi64(z, 0)); +} +''' + + if cc.links(prog, + name: 'AVX-512 CRC32C', + args: test_c_args) + cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1) + endif + endif elif host_cpu == 'arm' or host_cpu == 'aarch64' diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index db6454090d2..5bd5a927742 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -651,6 +651,9 @@ /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING +/* Define to 1 to use AVX-512 CRC instructions with a runtime check. */ +#undef USE_AVX512_CRC_WITH_RUNTIME_CHECK + /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 0ab7513f523..17dec5f6007 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -42,7 +42,10 @@ typedef uint32 pg_crc32c; #define EQ_CRC32C(c1, c2) ((c1) == (c2)) #if defined(USE_SSE42_CRC32C) -/* Use Intel SSE4.2 instructions. */ +/* + * Use either Intel SSE 4.2 or PCLMUL instructions. We don't need a runtime check + * for SSE 4.2, so we can inline those in some cases. 
+ */ #include <nmmintrin.h> @@ -50,7 +53,11 @@ typedef uint32 pg_crc32c; ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); +#endif pg_attribute_no_sanitize_alignment() static inline @@ -76,9 +83,27 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) return crc; } else - return pg_comp_crc32c_sse42(crc, data, len); + /* Otherwise, use a runtime check for PCLMUL instructions. */ + return pg_comp_crc32c(crc, data, len); } +#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) + +/* + * Use Intel SSE 4.2 or PCLMUL instructions, but perform a runtime check first + * to check that they are available. + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len); +#endif + #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. */ @@ -97,10 +122,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); -#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) +#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* - * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first + * Use ARMv8 instructions, but perform a runtime check first * to check that they are available. 
*/ #define COMP_CRC32C(crc, data, len) \ @@ -109,13 +134,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); - -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#endif -#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); -#endif #else /* diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..8d70a4d510e 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -83,6 +83,7 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df31..f392eb5b236 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -15,6 +15,7 @@ #include "c.h" #include <nmmintrin.h> +#include <immintrin.h> #include "port/pg_crc32c.h" @@ -68,3 +69,77 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) return crc; } + +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */ +/* MIT licensed */ + +#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) +#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) + +pg_attribute_target("avx512vl,vpclmulqdq") +pg_crc32c +pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + size_t len = length; + const char *buf = data; + + /* Align on cacheline boundary. WIP: The threshold needs testing. */ + if (unlikely(len > 256)) + { + for (; len && ((uintptr_t) buf & 7); --len) + { + crc0 = _mm_crc32_u8(crc0, *buf++); + } + while (((uintptr_t) buf & 56) && len >= 8) + { + crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } + } + + if (len >= 64) + { + const char *end = buf + len; + const char *limit = buf + len - 64; + __m128i z0; + + /* First vector chunk. */ + __m512i x0 = _mm512_loadu_si512((const void *) buf), + y0; + __m512i k; + + k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0)); + x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); + buf += 64; + + /* Main loop. */ + while (buf <= limit) + { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void *) buf), 0x96); + buf += 64; + } + + /* Reduce 512 bits to 128 bits. */ + k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb, 0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0); + y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); + y0 = _mm512_xor_si512(y0, k); + z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96); + z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1)); + len = end - buf; + } + + return pg_comp_crc32c_sse42(crc0, buf, len); +} + +#endif diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 65dbc4d4249..c2d25253c2c 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -20,30 +20,35 @@ #include "c.h" -#ifdef HAVE__GET_CPUID +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include <cpuid.h> #endif -#ifdef HAVE__CPUID +#include <immintrin.h> + +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) #include <intrin.h> #endif #include "port/pg_crc32c.h" -static bool -pg_crc32c_sse42_available(void) +/* + * Does XGETBV say the ZMM registers are enabled? + * + * NB: Caller is responsible for verifying that osxsave is available + * before calling this. + */ +#ifdef HAVE_XSAVE_INTRINSICS +pg_attribute_target("xsave") +#endif +static inline bool +zmm_regs_available(void) { - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0xe6) == 0xe6; #else -#error cpuid instruction not available + return false; #endif - - return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ } /* @@ -53,10 +58,48 @@ pg_crc32c_sse42_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { - if (pg_crc32c_sse42_available()) + unsigned int exx[4] = {0, 0, 0, 0}; + +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK + + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. + */ + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + + if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ + { pg_comp_crc32c = pg_comp_crc32c_sse42; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; + + if (exx[2] & (1 << 27) && /* OSXSAVE */ + zmm_regs_available()) + { +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + /* second cpuid call on leaf 7 to check extended avx512 support */ + + memset(exx, 0, 4 * sizeof(exx[0])); + +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#endif + if (exx[2] & (1 << 10) && /* VPCLMULQDQ */ + exx[1] & (1 << 31)) /* AVX512-VL */ + pg_comp_crc32c = pg_comp_crc32c_pclmul; +#endif + } + } return pg_comp_crc32c(crc, data, len); } -- 2.48.1
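As a sanity check for the selection logic in pg_crc32c_sse42_choose.c, a rough standalone probe (GCC/Clang only, not part of the patches) can walk the same cpuid/xgetbv checks and report which implementation would be chosen on the current machine:

/* Build with something like: gcc -O2 probe.c -o probe */
#include <stdbool.h>
#include <stdio.h>

#include <cpuid.h>
#include <immintrin.h>

/* Mirrors zmm_regs_available(): XMM, YMM and ZMM state enabled in XCR0 */
__attribute__((target("xsave")))
static bool
zmm_regs_available(void)
{
	return (_xgetbv(0) & 0xe6) == 0xe6;
}

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;
	const char *choice = "slicing-by-8";

	__get_cpuid(1, &eax, &ebx, &ecx, &edx);

	if (ecx & (1 << 20))		/* SSE 4.2 */
	{
		choice = "sse42";

		if ((ecx & (1 << 27)) && zmm_regs_available())	/* OSXSAVE + ZMM */
		{
			/* leaf 7 for the extended AVX-512 features */
			__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
			if ((ecx & (1 << 10)) &&	/* VPCLMULQDQ */
				(ebx & (1U << 31)))	/* AVX512-VL */
				choice = "pclmul";
		}
	}

	printf("CRC-32C implementation: %s\n", choice);
	return 0;
}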