On Thu, Apr 2, 2026 at 11:17 PM Nathan Bossart <[email protected]> wrote:
>
> On Thu, Apr 02, 2026 at 10:53:24AM -0500, Nathan Bossart wrote:
> > I think the new pg_comp_crc32_choose() is infinitely recursing on macOS
> > because USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK is not defined but
> > pg_crc32c_armv8_available() returns false.  If I trace through that
> > function, I see that it's going straight to the
> >
> >       #else
> >               return false;
> >       #endif
> >
> > at the end.  And sure enough, both HAVE_ELF_AUX_INFO and HAVE_GETAUXVAL

Ah of course.

> > aren't defined in pg_config.h.  I think we might need to use sysctlbyname()
> > to determine PMULL support on macOS, but at this stage of the development
> > cycle, I would probably lean towards just compiling in the sb8
> > implementation.
>
> Hm.  On second thought, that probably regresses macOS builds because it was
> presumably using the armv8 path without runtime checks before...

I went with the following for v5, and it passes macOS on my GitHub CI:

+  /* set fallbacks */
+#ifdef USE_ARMV8_CRC32C
+  /* On e.g. MacOS, our runtime feature detection doesn't work */
+  pg_comp_crc32c = pg_comp_crc32c_armv8;
+#else
+  pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+ [...crc and pmull checks]

That should keep scalar hardware support working, but now it'll only
use direct calls for constant inputs.

I also did some benchmarking on an ARM Neoverse N1 / gcc 8.3
(attached). There the vector loop still works well all the way down to
the minimum input size of 64 bytes, and on long inputs it's almost
twice as fast as scalar. For reproducibility, I slightly modified the
benchmark we used last year, to make sure the input is aligned
(attached but not for CI). In the end, I want to add a length check so
that inputs smaller than 80 bytes go straight to the scalar path.
Above 80, after alignment adjustments in the preamble, that still
guarantees at least one loop iteration in the vector path.

--
John Naylor
Amazon Web Services
master:

32
latency average = 9.758 ms
latency average = 9.449 ms
latency average = 9.475 ms
64
latency average = 12.131 ms
latency average = 12.276 ms
latency average = 12.200 ms
96
latency average = 14.837 ms
latency average = 14.834 ms
latency average = 14.844 ms
128
latency average = 17.589 ms
latency average = 17.499 ms
latency average = 17.516 ms
160
latency average = 20.177 ms
latency average = 20.292 ms
latency average = 20.176 ms
192
latency average = 22.895 ms
latency average = 22.879 ms
latency average = 22.860 ms
224
latency average = 25.625 ms
latency average = 25.609 ms
latency average = 25.600 ms
256
latency average = 28.206 ms
latency average = 28.213 ms
latency average = 28.275 ms
8192
latency average = 698.934 ms
latency average = 712.760 ms
latency average = 700.519 ms

v5:

32
latency average = 10.099 ms
latency average = 10.241 ms
latency average = 10.209 ms
64
latency average = 10.260 ms
latency average = 10.232 ms
latency average = 10.220 ms
96
latency average = 12.234 ms
latency average = 12.297 ms
latency average = 12.475 ms
128
latency average = 13.149 ms
latency average = 13.380 ms
latency average = 13.093 ms
160
latency average = 14.228 ms
latency average = 13.829 ms
latency average = 14.450 ms
192
latency average = 15.383 ms
latency average = 15.171 ms
latency average = 15.173 ms
224
latency average = 16.908 ms
latency average = 16.746 ms
latency average = 16.546 ms
256
latency average = 18.130 ms
latency average = 18.271 ms
latency average = 18.150 ms
8192
latency average = 410.864 ms
latency average = 408.692 ms
latency average = 401.145 ms

Attachment: test-crc.sh
Description: application/shellscript

From 8adcb9c0305bffb1ce33cab3e4ceec984d72b351 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Tue, 31 Mar 2026 17:40:38 +0700
Subject: [PATCH v5] Compute CRC32C on ARM using the Crypto Extension where
 available

---
 config/c-compiler.m4              |  41 ++++++++++
 configure                         |  67 +++++++++++++++-
 configure.ac                      |  13 ++-
 meson.build                       |  33 ++++++++
 src/include/pg_config.h.in        |   3 +
 src/include/port/pg_crc32c.h      |  22 ++++--
 src/port/meson.build              |   1 +
 src/port/pg_crc32c_armv8.c        | 127 ++++++++++++++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c |  38 ++++++++-
 9 files changed, 334 insertions(+), 11 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 629572ee350..f8a9a69f20a 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -759,6 +759,47 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
 
+# PGAC_ARM_PLMULL
+# ---------------------------
+# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+# instructions used for vectorized CRC.
+#
+# If the instructions are supported, sets pgac_arm_pmull.
+AC_DEFUN([PGAC_ARM_PLMULL],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_$1])])dnl
+AC_CACHE_CHECK([for pmull and pmull2], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t  a;
+uint64x2_t  b;
+uint64x2_t  c;
+uint64x2_t  r1;
+uint64x2_t  r2;
+
+  #if defined(__has_attribute) && __has_attribute (target)
+  __attribute__((target("+crypto")))
+  #endif
+  static int pmull_test(void)
+  {
+    __asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+    __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    r1 = veorq_u64(r1, r2);
+    /* return computed value, to prevent the above being optimized away */
+    return (int) vgetq_lane_u64(r1, 0);
+  }],
+  [return pmull_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_arm_pmull=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARM_PLMULL
+
 # PGAC_LOONGARCH_CRC32C_INTRINSICS
 # ---------------------------
 # Check if the compiler supports the LoongArch CRCC instructions, using
diff --git a/configure b/configure
index fe22bc71d0c..a1ed54d2439 100755
--- a/configure
+++ b/configure
@@ -18314,7 +18314,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; }
 
 $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
 
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
       { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
 $as_echo "ARMv8 CRC instructions" >&6; }
     else
@@ -18399,6 +18399,61 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
   pgac_avx512_pclmul_intrinsics=yes
 fi
 
+else
+  if test x"$host_cpu" = x"aarch64"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2" >&5
+$as_echo_n "checking for pmull and pmull2... " >&6; }
+if ${pgac_cv_arm_pmull_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t  a;
+uint64x2_t  b;
+uint64x2_t  c;
+uint64x2_t  r1;
+uint64x2_t  r2;
+
+  #if defined(__has_attribute) && __has_attribute (target)
+  __attribute__((target("+crypto")))
+  #endif
+  static int pmull_test(void)
+  {
+    __asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+    __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    r1 = veorq_u64(r1, r2);
+    /* return computed value, to prevent the above being optimized away */
+    return (int) vgetq_lane_u64(r1, 0);
+  }
+int
+main ()
+{
+return pmull_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_pmull_=yes
+else
+  pgac_cv_arm_pmull_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_" >&5
+$as_echo "$pgac_cv_arm_pmull_" >&6; }
+if test x"$pgac_cv_arm_pmull_" = x"yes"; then
+  pgac_arm_pmull=yes
+fi
+
+  fi
 fi
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
@@ -18410,8 +18465,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
 $as_echo "AVX-512 with runtime check" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
+  if test x"$pgac_arm_pmull" = x"yes"; then
+
+$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRYPTO PMULL with runtime check" >&5
+$as_echo "CRYPTO PMULL with runtime check" >&6; }
+  else
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
 $as_echo "none" >&6; }
+  fi
 fi
 
 # Select semaphore implementation type.
diff --git a/configure.ac b/configure.ac
index 6873b7546dd..afea6118eb1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2268,7 +2268,7 @@ else
   else
     if test x"$USE_ARMV8_CRC32C" = x"1"; then
       AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
       AC_MSG_RESULT(ARMv8 CRC instructions)
     else
       if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@@ -2295,6 +2295,10 @@ AC_SUBST(PG_CRC32C_OBJS)
 #
 if test x"$host_cpu" = x"x86_64"; then
   PGAC_AVX512_PCLMUL_INTRINSICS()
+else
+  if test x"$host_cpu" = x"aarch64"; then
+    PGAC_ARM_PLMULL()
+  fi
 fi
 
 AC_MSG_CHECKING([for vectorized CRC-32C])
@@ -2302,7 +2306,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
   AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
   AC_MSG_RESULT(AVX-512 with runtime check)
 else
-  AC_MSG_RESULT(none)
+  if test x"$pgac_arm_pmull" = x"yes"; then
+    AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.])
+    AC_MSG_RESULT(CRYPTO PMULL with runtime check)
+  else
+    AC_MSG_RESULT(none)
+  fi
 fi
 
 # Select semaphore implementation type.
diff --git a/meson.build b/meson.build
index 6bc74c2ba79..6d7f697583f 100644
--- a/meson.build
+++ b/meson.build
@@ -2720,6 +2720,39 @@ int main(void)
     have_optimized_crc = true
   endif
 
+    # Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+    # instructions used for vectorized CRC.
+    prog = '''
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t	a;
+uint64x2_t	b;
+uint64x2_t	c;
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("+crypto")))
+#endif
+int main(void)
+{
+    uint64x2_t	r1;
+    uint64x2_t	r2;
+
+	__asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+	__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+	r1 = veorq_u64(r1, r2);
+	/* return computed value, to prevent the above being optimized away */
+	return (int) vgetq_lane_u64(r1, 0);
+}
+'''
+
+  if cc.links(prog,
+      name: 'CRYPTO CRC32C',
+      args: test_c_args)
+    # Use ARM CRYPTO Extension, with runtime check
+    cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 elif host_cpu == 'loongarch64'
 
   prog = '''
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8d61918aff..dbc97c565a3 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -729,6 +729,9 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
+/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */
+#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 1f8e837d119..10518614664 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 #endif
 
 #elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
-
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * We don't need a runtime check for CRC, so for constant inputs, where
+ * we assume the input is small, we can avoid an indirect function call.
+ */
 #define COMP_CRC32C(crc, data, len)							\
-	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+	((crc) = __builtin_constant_p(len) ? 					\
+		pg_comp_crc32c_armv8((crc), (data), (len)) : 		\
+		pg_comp_crc32c((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
@@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
- * Use ARMv8 instructions, but perform a runtime check first
- * to check that they are available.
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions,
+ * but perform a runtime check first to check that they are available.
  */
 #define COMP_CRC32C(crc, data, len) \
 	((crc) = pg_comp_crc32c((crc), (data), (len)))
@@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #else
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index d55cb0424f3..922b3f64676 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -93,6 +93,7 @@ replace_funcs_pos = [
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 9ca0f728d39..aa0089a431c 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -20,6 +20,10 @@
 #include <arm_acle.h>
 #endif
 
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
+
 #include "port/pg_crc32c.h"
 
 pg_crc32c
@@ -77,3 +81,126 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ *   - match our function declaration
+ *   - match whitespace to our project style
+ *   - be more friendly for pgindent
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+pg_attribute_target("+crypto")
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	const char *buf = data;
+
+	/* align to 16 bytes */
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = __crc32cb(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		uint64x2_t	x0 = vld1q_u64((const uint64_t *) buf),
+					y0;
+		uint64x2_t	x1 = vld1q_u64((const uint64_t *) (buf + 16)),
+					y1;
+		uint64x2_t	x2 = vld1q_u64((const uint64_t *) (buf + 32)),
+					y2;
+		uint64x2_t	x3 = vld1q_u64((const uint64_t *) (buf + 48)),
+					y3;
+		uint64x2_t	k;
+
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
+
+			k = vld1q_u64(k_);
+		}
+
+		/*
+		 * pgindent complained of unmatched parens upstream:
+		 *
+		 * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
+		 */
+		x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0);
+		buf += 64;
+
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
+			y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
+			y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
+			y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
+		y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
+		crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
+		len = end - buf;
+	}
+
+	return pg_comp_crc32c_armv8(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index a1f0e540c6b..72d70aea1e1 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void)
 #endif
 }
 
+static inline bool
+pg_pmull_available(void)
+{
+#if defined(__aarch64__) && defined(HWCAP_PMULL)
+
+#ifdef HAVE_ELF_AUX_INFO
+	unsigned long value;
+
+	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+		(value & HWCAP_PMULL) != 0;
+#elif defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+#else
+	return false;
+#endif
+
+#else
+	return false;
+#endif
+}
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+	/* set fallbacks */
+#ifdef USE_ARMV8_CRC32C
+	/* On e.g. MacOS, our runtime feature detection doesn't work */
+	pg_comp_crc32c = pg_comp_crc32c_armv8;
+#else
+	pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
 	if (pg_crc32c_armv8_available())
+	{
 		pg_comp_crc32c = pg_comp_crc32c_armv8;
-	else
-		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+		if (pg_pmull_available())
+			pg_comp_crc32c = pg_comp_crc32c_pmull;
+#endif
+	}
 
 	return pg_comp_crc32c(crc, data, len);
 }
-- 
2.53.0

Attachment: v503-0002-Add-a-Postgres-SQL-function-for-crc32c-benchmar.patch.nocfbot
Description: Binary data

Reply via email to