On Thu, Feb 19, 2026 at 1:47 AM Zsolt Parragi <[email protected]> wrote:
>
> > Done. I haven't tried Arm support yet, but now I realize the header
> > should be named generically, so it's now "pg_cpu.h". Then it can be
> > included everywhere.
>
> That makes sense, and simplifies the usage of the header. (However,
> the include guard still refers to the old name)

Oops, fixed.

> > I don't know. The instruction family names are conventionally all in
> > caps, but this is just our signal that we've populated the array. That
> > said, a less generic name would be better for grep-ability.
>
> Yes, that could work too. But reserving the lowercase "init" symbol in
> a very generic header seems like a bad idea (especially for a use case
> that isn't used globally), even if Postgres itself doesn't use the
> symbol for anything else. "INIT" at least would be unlikely to
> conflict with something else.

Still seems pretty generic, so I went with INIT_PG_X86.

I've also made a quick attempt at Arm support just to make sure I
didn't paint myself into a corner (v4-0005-6), and it compiles and
passes tests on a Debian aarch64 system with gcc 8.3. I'll put that
aside for later. v4-0001-3 are still the main focus now, and seem in
decent shape, though they may need a bit more polish (not to mention
formal commit messages).

--
John Naylor
Amazon Web Services
From 480fc1ad8ee8e0c5cecfd0ea7348b877feb77944 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Thu, 12 Feb 2026 12:45:23 +0700
Subject: [PATCH v4 2/6] Centralize detection of CPU features

WIP: x86 only
---
 src/include/port/pg_cpu.h  | 50 +++++++++++++++++++++
 src/port/pg_cpu_x86.c      | 62 +++++++++++---------------
 src/port/pg_crc32c_sse42.c | 28 ++++++++++++
 src/port/pg_popcount_x86.c | 91 ++------------------------------------
 4 files changed, 107 insertions(+), 124 deletions(-)
 create mode 100644 src/include/port/pg_cpu.h

diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
new file mode 100644
index 00000000000..c9e03d016e9
--- /dev/null
+++ b/src/include/port/pg_cpu.h
@@ -0,0 +1,50 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_cpu.h
+ *	  Runtime CPU feature detection
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/pg_cpu.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_CPU_H
+#define PG_CPU_H
+
+#if defined(USE_SSE2) || defined(__i386__)
+
+typedef enum X86FeatureId
+{
+	/* Have we run feature detection? */
+	INIT_PG_X86,
+
+	/* scalar and 128-bit registers */
+	PG_SSE4_2,
+	PG_POPCNT,
+
+	/* 512-bit registers */
+	PG_AVX512_BW,
+	PG_AVX512_VL,
+	PG_AVX512_VPCLMULQDQ,
+	PG_AVX512_VPOPCNTDQ,
+}			X86FeatureId;
+#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1)
+
+extern PGDLLIMPORT bool X86Features[];
+
+extern void set_x86_features(void);
+
+static inline bool
+x86_feature_available(X86FeatureId feature)
+{
+	if (X86Features[INIT_PG_X86] == false)
+		set_x86_features();
+
+	return X86Features[feature];
+}
+
+#endif							/* defined(USE_SSE2) || defined(__i386__) */
+
+#endif							/* PG_CPU_H */
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 998a70ffa41..725bd1f68c8 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -1,12 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pg_cpu_x86.c
- *	  Choose between Intel SSE 4.2 and software CRC-32C implementation.
- *
- * On first call, checks if the CPU we're running on supports Intel SSE
- * 4.2. If it does, use the special SSE instructions for CRC-32C
- * computation. Otherwise, fall back to the pure software implementation
- * (slicing-by-8).
+ *	  Runtime CPU feature detection for x86
  *
  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -34,7 +29,10 @@
 #include <immintrin.h>
 #endif
 
-#include "port/pg_crc32c.h"
+#include "port/pg_cpu.h"
+
+
+bool		X86Features[X86FeaturesSize] = {0};
 
 /*
  * Does XGETBV say the ZMM registers are enabled?
@@ -56,22 +54,13 @@ zmm_regs_available(void)
 }
 
 /*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
+ * Parse the CPU ID info for runtime checks.
  */
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
+void
+set_x86_features(void)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
 
-	/*
-	 * Set fallback. We must guard since slicing-by-8 is not visible
-	 * everywhere.
-	 */
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
-	pg_comp_crc32c = pg_comp_crc32c_sb8;
-#endif
-
 #if defined(HAVE__GET_CPUID)
 	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
 #elif defined(HAVE__CPUID)
@@ -80,34 +69,33 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 #error cpuid instruction not available
 #endif
 
-	if ((exx[2] & (1 << 20)) != 0)	/* SSE 4.2 */
-	{
-		pg_comp_crc32c = pg_comp_crc32c_sse42;
+	X86Features[PG_SSE4_2] = exx[2] >> 20 & 1;
+	X86Features[PG_POPCNT] = exx[2] >> 23 & 1;
 
-		if (exx[2] & (1 << 27) &&	/* OSXSAVE */
-			zmm_regs_available())
-		{
-			/* second cpuid call on leaf 7 to check extended AVX-512 support */
+	/* All these features depend on OSXSAVE */
+	if (exx[2] & (1 << 27))
+	{
+		/* second cpuid call on leaf 7 to check extended AVX-512 support */
 
-			memset(exx, 0, 4 * sizeof(exx[0]));
+		memset(exx, 0, 4 * sizeof(exx[0]));
 
 #if defined(HAVE__GET_CPUID_COUNT)
-			__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+		__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
 #elif defined(HAVE__CPUIDEX)
-			__cpuidex(exx, 7, 0);
+		__cpuidex(exx, 7, 0);
 #endif
 
-#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
-			if (exx[2] & (1 << 10) &&	/* VPCLMULQDQ */
-				exx[1] & (1 << 31)) /* AVX512-VL */
-				pg_comp_crc32c = pg_comp_crc32c_avx512;
-#endif
+		if (zmm_regs_available())
+		{
+			X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1;
+			X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1;
+
+			X86Features[PG_VPCLMULQDQ] = exx[2] >> 10 & 1;
+			X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1;
 		}
 	}
 
-	return pg_comp_crc32c(crc, data, len);
+	X86Features[INIT_PG_X86] = true;
 }
 
-pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
-
 #endif							/* defined(USE_SSE2) || defined(__i386__) */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index c1279d31fbd..2e740e12a7a 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -20,6 +20,9 @@
 #endif
 
 #include "port/pg_crc32c.h"
+#include "port/pg_cpu.h"
+
+static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len);
 
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
@@ -159,3 +162,28 @@ pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len)
 }
 
 #endif
+
+static pg_crc32c
+pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
+{
+	/*
+	 * Set fallback. We must guard since slicing-by-8 is not visible
+	 * everywhere.
+	 */
+#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
+	pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
+	if (x86_feature_available(PG_SSE4_2))
+		pg_comp_crc32c = pg_comp_crc32c_sse42;
+
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+	if (x86_feature_available(PG_AVX512_VL) &&
+		x86_feature_available(PG_VPCLMULQDQ))
+		pg_comp_crc32c = pg_comp_crc32c_avx512;
+#endif
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 6bce089432f..a99613f1818 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -14,19 +14,12 @@
 
 #ifdef HAVE_X86_64_POPCNTQ
 
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
-#include <cpuid.h>
-#endif
-
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 #include <immintrin.h>
 #endif
 
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
-#include <intrin.h>
-#endif
-
 #include "port/pg_bitutils.h"
+#include "port/pg_cpu.h"
 
 /*
  * The SSE4.2 versions are built regardless of whether we are building the
@@ -58,84 +51,9 @@ static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
 uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 
-/*
- * Return true if CPUID indicates that the POPCNT instruction is available.
- */
-static bool
-pg_popcount_sse42_available(void)
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-	__cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-
-	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
-}
 
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
-/*
- * Does CPUID say there's support for XSAVE instructions?
- */
-static inline bool
-xsave_available(void)
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-	__cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-	return (exx[2] & (1 << 27)) != 0;	/* osxsave */
-}
-
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that xsave_available() returns true
- * before calling this.
- */
-#ifdef HAVE_XSAVE_INTRINSICS
-pg_attribute_target("xsave")
-#endif
-static inline bool
-zmm_regs_available(void)
-{
-#ifdef HAVE_XSAVE_INTRINSICS
-	return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
-	return false;
-#endif
-}
-
-/*
- * Does CPUID say there's support for AVX-512 popcount and byte-and-word
- * instructions?
- */
-static inline bool
-avx512_popcnt_available(void)
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID_COUNT)
-	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUIDEX)
-	__cpuidex(exx, 7, 0);
-#else
-#error cpuid instruction not available
-#endif
-	return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
-		(exx[1] & (1 << 30)) != 0;	/* avx512-bw */
-}
-
 /*
  * Returns true if the CPU supports the instructions required for the AVX-512
  * pg_popcount() implementation.
@@ -143,9 +61,8 @@ avx512_popcnt_available(void)
 static bool
 pg_popcount_avx512_available(void)
 {
-	return xsave_available() &&
-		zmm_regs_available() &&
-		avx512_popcnt_available();
+	return x86_feature_available(PG_AVX512_BW) &&
+		x86_feature_available(PG_AVX512_VPOPCNTDQ);
 }
 
 #endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
@@ -159,7 +76,7 @@ pg_popcount_avx512_available(void)
 static inline void
 choose_popcount_functions(void)
 {
-	if (pg_popcount_sse42_available())
+	if (x86_feature_available(PG_POPCNT))
 	{
 		pg_popcount_optimized = pg_popcount_sse42;
 		pg_popcount_masked_optimized = pg_popcount_masked_sse42;
-- 
2.53.0

From 7ea446c6cf79c7f5924b601b9e10d0de2c069c14 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Wed, 11 Feb 2026 14:34:18 +0700
Subject: [PATCH v4 1/6] Rename CRC "choose" files for future general purpose

WIP: x86 only
---
 configure                                           | 4 ++--
 configure.ac                                        | 4 ++--
 src/port/Makefile                                   | 1 +
 src/port/meson.build                                | 3 +--
 src/port/{pg_crc32c_sse42_choose.c => pg_cpu_x86.c} | 8 ++++++--
 5 files changed, 12 insertions(+), 8 deletions(-)
 rename src/port/{pg_crc32c_sse42_choose.c => pg_cpu_x86.c} (94%)

diff --git a/configure b/configure
index a10a2c85c6a..185703289b4 100755
--- a/configure
+++ b/configure
@@ -18196,7 +18196,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
 
 $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
 
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
@@ -18204,7 +18204,7 @@ else
 
 $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o"
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
diff --git a/configure.ac b/configure.ac
index 814e64a967e..0955b7e4371 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2245,12 +2245,12 @@ fi
 AC_MSG_CHECKING([which CRC-32C implementation to use])
 if test x"$USE_SSE42_CRC32C" = x"1"; then
   AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
-  PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
+  PG_CRC32C_OBJS="pg_crc32c_sse42.o"
   AC_MSG_RESULT(SSE 4.2)
 else
   if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
     AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
     if test x"$USE_ARMV8_CRC32C" = x"1"; then
diff --git a/src/port/Makefile b/src/port/Makefile
index 6e3b7d154ed..47cfea1507d 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
 	noblock.o \
 	path.o \
 	pg_bitutils.o \
+	pg_cpu_x86.o \
 	pg_localeconv_r.o \
 	pg_numa.o \
 	pg_popcount_aarch64.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index d7d4e705b89..edb2e5632bd 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_cpu_x86.c',
   'pg_localeconv_r.c',
   'pg_numa.c',
   'pg_popcount_aarch64.c',
@@ -86,8 +87,6 @@ replace_funcs_pos = [
   # x86/x64
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
-  ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
-  ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
 
   # arm / aarch64
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_cpu_x86.c
similarity index 94%
rename from src/port/pg_crc32c_sse42_choose.c
rename to src/port/pg_cpu_x86.c
index f586476964f..998a70ffa41 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_cpu_x86.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  *
- * pg_crc32c_sse42_choose.c
+ * pg_cpu_x86.c
  *	  Choose between Intel SSE 4.2 and software CRC-32C implementation.
  *
  * On first call, checks if the CPU we're running on supports Intel SSE
@@ -13,13 +13,15 @@
  *
  *
  * IDENTIFICATION
- *	  src/port/pg_crc32c_sse42_choose.c
+ *	  src/port/pg_cpu_x86.c
  *
  *-------------------------------------------------------------------------
  */
 
 #include "c.h"
 
+#if defined(USE_SSE2) || defined(__i386__)
+
 #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
 #include <cpuid.h>
 #endif
@@ -107,3 +109,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 }
 
 pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif							/* defined(USE_SSE2) || defined(__i386__) */
-- 
2.53.0

From 03e6fdd45f4a9bc1bd045feff2ee395c45cc9a88 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Fri, 13 Feb 2026 18:11:39 +0700
Subject: [PATCH v4 3/6] Refactor the detection of ZMM registers

- Call _xgetbv within set_x86_features rather than in a
  separate function
- Use symbols for XCR mask bits rather than a magic constant

A future commit will build on this to detect YMM registers without
code duplication.
---
 src/port/pg_cpu_x86.c      | 42 +++++++++++++++++++++-----------------
 src/port/pg_crc32c_sse42.c |  2 +-
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 725bd1f68c8..9b462b186b8 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -31,31 +31,28 @@
 
 #include "port/pg_cpu.h"
 
+/* XSAVE state component bits that we need */
+#define XMM (1<<1)
+#define YMM  (1<<2)
+#define OPMASK  (1<<5)
+#define ZMM0_15  (1<<6)
+#define ZMM16_31  (1<<7)
+
 
 bool		X86Features[X86FeaturesSize] = {0};
 
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that osxsave is available
- * before calling this.
- */
-#ifdef HAVE_XSAVE_INTRINSICS
-pg_attribute_target("xsave")
-#endif
 static bool
-zmm_regs_available(void)
+mask_available(uint32 value, uint32 mask)
 {
-#ifdef HAVE_XSAVE_INTRINSICS
-	return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
-	return false;
-#endif
+	return (value & mask) == mask;
 }
 
 /*
  * Parse the CPU ID info for runtime checks.
  */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
 void
 set_x86_features(void)
 {
@@ -75,22 +72,29 @@ set_x86_features(void)
 	/* All these features depend on OSXSAVE */
 	if (exx[2] & (1 << 27))
 	{
-		/* second cpuid call on leaf 7 to check extended AVX-512 support */
+		uint32		xcr0_val = 0;
 
+		/* second cpuid call on leaf 7 to check extended AVX-512 support */
 		memset(exx, 0, 4 * sizeof(exx[0]));
-
 #if defined(HAVE__GET_CPUID_COUNT)
 		__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
 #elif defined(HAVE__CPUIDEX)
 		__cpuidex(exx, 7, 0);
 #endif
 
-		if (zmm_regs_available())
+#ifdef HAVE_XSAVE_INTRINSICS
+		/* get value of Extended Control Register */
+		xcr0_val = _xgetbv(0);
+#endif
+
+		/* Are ZMM registeres enabled? */
+		if (mask_available(xcr0_val, XMM | YMM |
+						   OPMASK | ZMM0_15 | ZMM16_31))
 		{
 			X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1;
 			X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1;
 
-			X86Features[PG_VPCLMULQDQ] = exx[2] >> 10 & 1;
+			X86Features[PG_AVX512_VPCLMULQDQ] = exx[2] >> 10 & 1;
 			X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1;
 		}
 	}
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 2e740e12a7a..d1d9d74e5ab 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -179,7 +179,7 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 
 #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 	if (x86_feature_available(PG_AVX512_VL) &&
-		x86_feature_available(PG_VPCLMULQDQ))
+		x86_feature_available(PG_AVX512_VPCLMULQDQ))
 		pg_comp_crc32c = pg_comp_crc32c_avx512;
 #endif
 
-- 
2.53.0

From 3d186bc8be218c609ff9005d88ea6222b3af4445 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Sat, 14 Feb 2026 19:01:34 +0700
Subject: [PATCH v4 4/6] Enable autovectorizing page checksums with AVX2 where
 available

We already rely on autovectorization for computing page checksums,
but on x86 we can get about twice the performance by annotating
pg_checksum_block() with function target attributes for AVX2,
which uses 256-bit registers.

Co-authored-by: Matthew Sterrett <[email protected]>
Co-authored-by: Andrew Kim <[email protected]>
Reviewed-by: Oleg Tselebrovskiy <[email protected]>
Discussion: https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com
Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal
---
 config/c-compiler.m4                          | 26 ++++++++++
 configure                                     | 46 ++++++++++++++++++
 configure.ac                                  |  9 ++++
 meson.build                                   | 30 ++++++++++++
 src/backend/storage/page/checksum.c           | 44 ++++++++++++++++-
 src/include/pg_config.h.in                    |  3 ++
 src/include/port/pg_cpu.h                     |  3 ++
 src/include/storage/checksum_block_internal.h | 42 ++++++++++++++++
 src/include/storage/checksum_impl.h           | 48 ++++++-------------
 src/port/pg_cpu_x86.c                         |  6 ++-
 src/tools/pginclude/headerscheck              |  2 +
 11 files changed, 224 insertions(+), 35 deletions(-)
 create mode 100644 src/include/storage/checksum_block_internal.h

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 1509dbfa2ab..1f3e31fc2d3 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -613,6 +613,32 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
 
+# PGAC_AVX2_SUPPORT
+# ---------------------------
+# Check if the compiler supports AVX2 target attribute.
+# This is used for optimized checksum calculations with runtime detection.
+#
+# If AVX2 target attribute is supported, sets pgac_avx2_support.
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar],
+[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    static int avx2_test(void)
+    {
+      return 0;
+    }
+    #endif],
+  [return avx2_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
 # PGAC_AVX512_PCLMUL_INTRINSICS
 # ---------------------------
 # Check if the compiler supports AVX-512 carryless multiplication
diff --git a/configure b/configure
index 185703289b4..2d2c6308005 100755
--- a/configure
+++ b/configure
@@ -17718,6 +17718,52 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
   fi
 fi
 
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5
+$as_echo_n "checking for AVX2 target attribute support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    static int avx2_test(void)
+    {
+      return 0;
+    }
+    #endif
+int
+main ()
+{
+return avx2_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  pgac_cv_avx2_support=yes
+else
+  pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+
+  if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for XSAVE intrinsics
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
diff --git a/configure.ac b/configure.ac
index 0955b7e4371..0b4c3970b68 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2122,6 +2122,15 @@ else
   fi
 fi
 
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_AVX2_SUPPORT()
+  if test x"$pgac_avx2_support" = x"yes"; then
+    AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+  fi
+fi
+
 # Check for XSAVE intrinsics
 #
 PGAC_XSAVE_INTRINSICS()
diff --git a/meson.build b/meson.build
index f6d5842d852..feea3658ff3 100644
--- a/meson.build
+++ b/meson.build
@@ -2377,6 +2377,36 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of AVX2 support
+###############################################################
+
+if host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx2")))
+#endif
+static int avx2_test(void)
+{
+    return 0;
+}
+
+int main(void)
+{
+    return avx2_test();
+}
+'''
+
+  if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+    cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Check for the availability of AVX-512 popcount intrinsics.
 ###############################################################
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c
index 8716651c8b5..030c44f7308 100644
--- a/src/backend/storage/page/checksum.c
+++ b/src/backend/storage/page/checksum.c
@@ -13,10 +13,52 @@
  */
 #include "postgres.h"
 
+#include "port/pg_cpu.h"
 #include "storage/checksum.h"
 /*
  * The actual code is in storage/checksum_impl.h.  This is done so that
  * external programs can incorporate the checksum code by #include'ing
- * that file from the exported Postgres headers.  (Compare our CRC code.)
+ * that file from the exported Postgres headers.  (Compare our legacy
+ * CRC code in pg_crc.h.)
+ * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific
+ * coding without affecting external programs.
  */
+#define PG_CHECKSUM_INTERNAL
 #include "storage/checksum_impl.h"	/* IWYU pragma: keep */
+
+
+static uint32
+pg_checksum_block_fallback(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block_internal.h"
+}
+
+/*
+ * AVX2-optimized block checksum algorithm.
+ */
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+pg_attribute_target("avx2")
+static uint32
+pg_checksum_block_avx2(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block_internal.h"
+}
+#endif							/* USE_AVX2_WITH_RUNTIME_CHECK */
+
+/*
+ * Choose the best available checksum implementation.
+ */
+static uint32
+pg_checksum_choose(const PGChecksummablePage *page)
+{
+	pg_checksum_block = pg_checksum_block_fallback;
+
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+	if (x86_feature_available(PG_AVX2))
+		pg_checksum_block = pg_checksum_block_avx2;
+#endif
+
+	return pg_checksum_block(page);
+}
+
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose;
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 339268dc8ef..1e43e9b2bc4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -665,6 +665,9 @@
 /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
 #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index c9e03d016e9..3c70fd43a23 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -24,6 +24,9 @@ typedef enum X86FeatureId
 	PG_SSE4_2,
 	PG_POPCNT,
 
+	/* 256-bit registers */
+	PG_AVX2,
+
 	/* 512-bit registers */
 	PG_AVX512_BW,
 	PG_AVX512_VL,
diff --git a/src/include/storage/checksum_block_internal.h b/src/include/storage/checksum_block_internal.h
new file mode 100644
index 00000000000..b4e6987d6b5
--- /dev/null
+++ b/src/include/storage/checksum_block_internal.h
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum_block_internal.h
+ *	  Core algorithm for page checksums, semi-private to checksum_impl.h
+ *	  and checksum.c.
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum_block_internal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INTERNAL_H here */
+
+uint32		sums[N_SUMS];
+uint32		result = 0;
+uint32		i,
+			j;
+
+/* ensure that the size is compatible with the algorithm */
+Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+
+/* initialize partial checksums to their corresponding offsets */
+memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+/* main checksum calculation */
+for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
+	for (j = 0; j < N_SUMS; j++)
+		CHECKSUM_COMP(sums[j], page->data[i][j]);
+
+/* finally add in two rounds of zeroes for additional mixing */
+for (i = 0; i < 2; i++)
+	for (j = 0; j < N_SUMS; j++)
+		CHECKSUM_COMP(sums[j], 0);
+
+/* xor fold partial checksums together */
+for (i = 0; i < N_SUMS; i++)
+	result ^= sums[i];
+
+return result;
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index 5c2dcbc63e7..8a308e423c3 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -73,11 +73,10 @@
  * 2e-16 false positive rate within margin of error.
  *
  * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
- * multiplication instruction. As of 2013 the corresponding instruction is
- * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
- * Vectorization requires a compiler to do the vectorization for us. For recent
- * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
- * to achieve vectorization.
+ * multiplication instruction. Examples include x86 AVX2 extensions (vpmulld)
+ * and ARM NEON (vmul.i32). For simplicity we rely on the compiler to do the
+ * vectorization for us. For GCC and clang the flags -funroll-loops
+ * -ftree-vectorize are enough to achieve vectorization.
  *
  * The optimal amount of parallelism to use depends on CPU specific instruction
  * latency, SIMD instruction width, throughput and the amount of registers
@@ -89,8 +88,9 @@
  *
  * The parallelism number 32 was chosen based on the fact that it is the
  * largest state that fits into architecturally visible x86 SSE registers while
- * leaving some free registers for intermediate values. For future processors
- * with 256bit vector registers this will leave some performance on the table.
+ * leaving some free registers for intermediate values. For processors
+ * with 256bit vector registers this leaves some performance on the table.
+ *
  * When vectorization is not available it might be beneficial to restructure
  * the computation to calculate a subset of the columns at a time and perform
  * multiple passes to avoid register spilling. This optimization opportunity
@@ -138,6 +138,9 @@ do { \
 	(checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \
 } while (0)
 
+/* Provide a static definition for external programs */
+#ifndef PG_CHECKSUM_INTERNAL
+
 /*
  * Block checksum algorithm.  The page must be adequately aligned
  * (at least on 4-byte boundary).
@@ -145,34 +148,13 @@ do { \
 static uint32
 pg_checksum_block(const PGChecksummablePage *page)
 {
-	uint32		sums[N_SUMS];
-	uint32		result = 0;
-	uint32		i,
-				j;
-
-	/* ensure that the size is compatible with the algorithm */
-	Assert(sizeof(PGChecksummablePage) == BLCKSZ);
-
-	/* initialize partial checksums to their corresponding offsets */
-	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
-
-	/* main checksum calculation */
-	for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
-		for (j = 0; j < N_SUMS; j++)
-			CHECKSUM_COMP(sums[j], page->data[i][j]);
-
-	/* finally add in two rounds of zeroes for additional mixing */
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < N_SUMS; j++)
-			CHECKSUM_COMP(sums[j], 0);
-
-	/* xor fold partial checksums together */
-	for (i = 0; i < N_SUMS; i++)
-		result ^= sums[i];
-
-	return result;
+#include "storage/checksum_block_internal.h"
 }
 
+#else
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page);
+#endif
+
 /*
  * Compute the checksum for a Postgres page.
  *
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 9b462b186b8..a48c39ade98 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -74,7 +74,7 @@ set_x86_features(void)
 	{
 		uint32		xcr0_val = 0;
 
-		/* second cpuid call on leaf 7 to check extended AVX-512 support */
+		/* second cpuid call on leaf 7 to check extended feature flags */
 		memset(exx, 0, 4 * sizeof(exx[0]));
 #if defined(HAVE__GET_CPUID_COUNT)
 		__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
@@ -87,6 +87,10 @@ set_x86_features(void)
 		xcr0_val = _xgetbv(0);
 #endif
 
+		/* Are YMM registers enabled? */
+		if (mask_available(xcr0_val, XMM | YMM))
+			X86Features[PG_AVX2] = exx[1] >> 5 & 1;
+
 		/* Are ZMM registeres enabled? */
 		if (mask_available(xcr0_val, XMM | YMM |
 						   OPMASK | ZMM0_15 | ZMM16_31))
diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck
index 7a6755991bb..569e749b25a 100755
--- a/src/tools/pginclude/headerscheck
+++ b/src/tools/pginclude/headerscheck
@@ -154,6 +154,8 @@ do
 	test "$f" = src/include/catalog/syscache_ids.h && continue
 	test "$f" = src/include/catalog/syscache_info.h && continue
 
+	test "$f" = src/include/storage/checksum_block_internal.h && continue
+
 	# We can't make these Bison output files compilable standalone
 	# without using "%code require", which old Bison versions lack.
 	# parser/gram.h will be included by parser/gramparse.h anyway.
-- 
2.53.0

From 58d1b495afc0791b6ab4dfaa421cee2bc09d5359 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Thu, 19 Feb 2026 15:04:35 +0700
Subject: [PATCH v4 5/6] Rename CRC Arm-v8 "choose" file for future
 general-purpose use

---
 configure                                             | 2 +-
 configure.ac                                          | 2 +-
 src/port/Makefile                                     | 1 +
 src/port/meson.build                                  | 1 +
 src/port/{pg_crc32c_armv8_choose.c => pg_cpu_armv8.c} | 8 ++++++--
 5 files changed, 10 insertions(+), 4 deletions(-)
 rename src/port/{pg_crc32c_armv8_choose.c => pg_cpu_armv8.c} (95%)

diff --git a/configure b/configure
index 2d2c6308005..ec76278f9c1 100755
--- a/configure
+++ b/configure
@@ -18266,7 +18266,7 @@ $as_echo "ARMv8 CRC instructions" >&6; }
 
 $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o"
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
diff --git a/configure.ac b/configure.ac
index 0b4c3970b68..5a3971bad63 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2269,7 +2269,7 @@ else
     else
       if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
         AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.])
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
         if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
diff --git a/src/port/Makefile b/src/port/Makefile
index 47cfea1507d..4ed7e1902fb 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
 	noblock.o \
 	path.o \
 	pg_bitutils.o \
+	pg_cpu_armv8.o \
 	pg_cpu_x86.o \
 	pg_localeconv_r.o \
 	pg_numa.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index edb2e5632bd..0b26218be7a 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_cpu_armv8.c',
   'pg_cpu_x86.c',
   'pg_localeconv_r.c',
   'pg_numa.c',
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_cpu_armv8.c
similarity index 95%
rename from src/port/pg_crc32c_armv8_choose.c
rename to src/port/pg_cpu_armv8.c
index a1f0e540c6b..6c22704b5fa 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_cpu_armv8.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  *
- * pg_crc32c_armv8_choose.c
+ * pg_cpu_armv8.c
  *	  Choose between ARMv8 and software CRC-32C implementation.
  *
  * On first call, checks if the CPU we're running on supports the ARMv8
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  src/port/pg_crc32c_armv8_choose.c
+ *	  src/port/pg_cpu_armv8.c
  *
  *-------------------------------------------------------------------------
  */
@@ -24,6 +24,8 @@
 #include "postgres_fe.h"
 #endif
 
+#if defined(__arm__) || defined(__arm) || defined(__aarch64__)
+
 #if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
 #include <sys/auxv.h>
 /* Ancient glibc releases don't include the HWCAPxxx macros in sys/auxv.h */
@@ -124,3 +126,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 }
 
 pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif      /* __arm__ || __arm || __aarch64__ */
-- 
2.53.0

From a3e9a1302d710f9fa6c48594144a82fe0c3988d6 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Thu, 19 Feb 2026 18:27:02 +0700
Subject: [PATCH v4 6/6] Centralize detection of CPU features Arm, take 1

---
 src/include/port/pg_cpu.h      | 25 ++++++++++++
 src/port/pg_cpu_armv8.c        | 74 +++++++++++++++-------------------
 src/port/pg_crc32c_armv8.c     | 20 +++++++++
 src/port/pg_popcount_aarch64.c | 26 +-----------
 4 files changed, 80 insertions(+), 65 deletions(-)

diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index 3c70fd43a23..3687e025083 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -48,6 +48,31 @@ x86_feature_available(X86FeatureId feature)
 	return X86Features[feature];
 }
 
+#elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
+
+typedef enum ArmFeatureId
+{
+	/* Have we run feature detection? */
+	INIT_PG_ARM,
+
+	PG_ARM_CRC32,
+	PG_ARM_SVE,
+}			ArmFeatureId;
+#define ArmFeaturesSize (PG_ARM_SVE + 1)
+
+extern PGDLLIMPORT bool ArmFeatures[];
+
+extern void set_arm_features(void);
+
+static inline bool
+arm_feature_available(ArmFeatureId feature)
+{
+	if (ArmFeatures[INIT_PG_ARM] == false)
+		set_arm_features();
+
+	return ArmFeatures[feature];
+}
+
 #endif							/* defined(USE_SSE2) || defined(__i386__) */
 
 #endif							/* PG_CPU_H */
diff --git a/src/port/pg_cpu_armv8.c b/src/port/pg_cpu_armv8.c
index 6c22704b5fa..59a9229b71d 100644
--- a/src/port/pg_cpu_armv8.c
+++ b/src/port/pg_cpu_armv8.c
@@ -1,12 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pg_cpu_armv8.c
- *	  Choose between ARMv8 and software CRC-32C implementation.
- *
- * On first call, checks if the CPU we're running on supports the ARMv8
- * CRC Extension. If it does, use the special instructions for CRC-32C
- * computation. Otherwise, fall back to the pure software implementation
- * (slicing-by-8).
+ *	  Runtime CPU feature detection for Arm-v8
  *
  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -41,27 +36,41 @@
 #endif
 #endif
 
-#include "port/pg_crc32c.h"
+#include "port/pg_cpu.h"
+
+
+bool		ArmFeatures[ArmFeaturesSize] = {0};
 
-static bool
-pg_crc32c_armv8_available(void)
+/* Return the auxiliary vector entry for the given type, or 0 if unavailable */
+static inline unsigned long
+pg_getauxval(unsigned long at_hwcap)
 {
 #if defined(HAVE_ELF_AUX_INFO)
 	unsigned long value;
 
-#ifdef __aarch64__
-	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
-		(value & HWCAP_CRC32) != 0;
-#else
-	return elf_aux_info(AT_HWCAP2, &value, sizeof(value)) == 0 &&
-		(value & HWCAP2_CRC32) != 0;
-#endif
+	return elf_aux_info(at_hwcap, &value, sizeof(value)) == 0 ? value : 0;
 #elif defined(HAVE_GETAUXVAL)
+	return getauxval(at_hwcap);
+#else
+	return 0;
+#endif
+}
+
+void
+set_arm_features(void)
+{
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
 #ifdef __aarch64__
-	return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
+	unsigned long hwcap = pg_getauxval(AT_HWCAP);
+
+	ArmFeatures[PG_ARM_CRC32] = (hwcap & HWCAP_CRC32) != 0;
+	ArmFeatures[PG_ARM_SVE] = (hwcap & HWCAP_SVE) != 0;
 #else
-	return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0;
+	unsigned long hwcap2 = pg_getauxval(AT_HWCAP2);
+
+	ArmFeatures[PG_ARM_CRC32] = (hwcap2 & HWCAP2_CRC32) != 0;
 #endif
+
 #elif defined(__NetBSD__)
 	/*
 	 * On NetBSD we can read the Instruction Set Attribute Registers via
@@ -92,9 +101,9 @@ pg_crc32c_armv8_available(void)
 	len = sizeof(sysctlbuf);
 	memset(sysctlbuf, 0, len);
 	if (sysctlbyname(path, sysctlbuf, &len, NULL, 0) != 0)
-		return false;			/* perhaps kernel is 64-bit and we aren't? */
+		return;					/* perhaps kernel is 64-bit and we aren't? */
 	if (len != expected_len)
-		return false;			/* kernel API change? */
+		return;					/* kernel API change? */
 
 	/* Fetch the CRC32 field from ISAR0. */
 	fld = (ISAR0 >> ISAR0_CRC32_BITPOS) & WIDTHMASK(ISAR0_CRC32_BITWIDTH);
@@ -104,27 +113,10 @@ pg_crc32c_armv8_available(void)
 	 * (CRC32B/CRC32H/CRC32W/CRC32X/CRC32CB/CRC32CH/CRC32CW/CRC32CX).  Assume
 	 * that any future nonzero value will be a superset of 1.
 	 */
-	return (fld != 0);
-#else
-	return false;
-#endif
-}
+	ArmFeatures[PG_ARM_CRC32] = (fld != 0);
+#endif							/* __NetBSD__ */
 
-/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
-{
-	if (pg_crc32c_armv8_available())
-		pg_comp_crc32c = pg_comp_crc32c_armv8;
-	else
-		pg_comp_crc32c = pg_comp_crc32c_sb8;
-
-	return pg_comp_crc32c(crc, data, len);
+	ArmFeatures[INIT_PG_ARM] = true;
 }
 
-pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
-
-#endif      /* __arm__ || __arm || __aarch64__ */
+#endif							/* __arm__ || __arm || __aarch64__ */
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 9ca0f728d39..a02264a00dc 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -20,8 +20,11 @@
 #include <arm_acle.h>
 #endif
 
+#include "port/pg_cpu.h"
 #include "port/pg_crc32c.h"
 
+static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len);
+
 pg_crc32c
 pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 {
@@ -77,3 +80,20 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+/*
+ * Initial target of pg_comp_crc32c: on the first call, pick the ARMv8 or
+ * slicing-by-8 implementation, repoint pg_comp_crc32c at it, and forward.
+ */
+static pg_crc32c
+pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
+{
+	if (arm_feature_available(PG_ARM_CRC32))
+		pg_comp_crc32c = pg_comp_crc32c_armv8;
+	else
+		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index f474ef45510..37adff67ce8 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -18,17 +18,10 @@
 
 #ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
 #include <arm_sve.h>
-
-#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
-#include <sys/auxv.h>
-/* Ancient glibc releases don't include the HWCAPxxx macros in sys/auxv.h */
-#if defined(__linux__) && !defined(HWCAP_SVE)
-#include <asm/hwcap.h>
-#endif
-#endif
 #endif
 
 #include "port/pg_bitutils.h"
+#include "port/pg_cpu.h"
 
 /*
  * The Neon versions are built regardless of whether we are building the SVE
@@ -56,25 +49,10 @@ static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
 uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 
-static inline bool
-pg_popcount_sve_available(void)
-{
-#ifdef HAVE_ELF_AUX_INFO
-	unsigned long value;
-
-	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
-		(value & HWCAP_SVE) != 0;
-#elif defined(HAVE_GETAUXVAL)
-	return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
-#else
-	return false;
-#endif
-}
-
 static inline void
 choose_popcount_functions(void)
 {
-	if (pg_popcount_sve_available())
+	if (arm_feature_available(PG_ARM_SVE))
 	{
 		pg_popcount_optimized = pg_popcount_sve;
 		pg_popcount_masked_optimized = pg_popcount_masked_sve;
-- 
2.53.0

Reply via email to