From 8a33e3737360156b862fc75666630f8043f7d285 Mon Sep 17 00:00:00 2001
From: Andrew Kim <andrew.kim@intel.com>
Date: Thu, 23 Oct 2025 16:35:22 -0700
Subject: [PATCH 1/1] Add AVX2 optimization for page checksum calculation

This patch adds runtime AVX2 detection and optimization for PostgreSQL's
page checksum algorithm while maintaining full backward compatibility.

Key changes:
- Add cross-platform AVX2 CPU detection with XSAVE/YMM register checks
- Implement function pointer dispatch pattern following PostgreSQL conventions
- Use compiler auto-vectorization with pg_attribute_target("avx2")
- Add build system support in both autotools and meson
- Maintain external program compatibility (pg_filedump, etc.)

The implementation uses the same algorithm for both default and AVX2 paths,
allowing the compiler to automatically vectorize the AVX2 version while
preserving identical results. Runtime detection ensures optimal performance
on supported hardware with graceful fallback on older systems.

Addresses reviewer feedback on configure test simplification, Windows
compatibility, and PostgreSQL coding conventions.
---
 config/c-compiler.m4             |  26 ++++++
 configure                        |  52 +++++++++++
 configure.ac                     |   9 ++
 meson.build                      |  30 +++++++
 src/include/pg_config.h.in       |   3 +
 src/include/port/checksum_impl.h | 142 ++++++++++++++++++++++++++++++-
 6 files changed, 261 insertions(+), 1 deletion(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 236a59e8536..40927d56e6a 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -581,6 +581,32 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
 
+# PGAC_AVX2_SUPPORT
+# ---------------------------
+# Check if the compiler supports AVX2 target attribute.
+# This is used for optimized checksum calculations with runtime detection.
+#
+# If AVX2 target attribute is supported, sets pgac_avx2_support.
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar],
+[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    static int avx2_test(void)
+    {
+      return 0;
+    }
+    #endif],
+  [return avx2_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
 # PGAC_AVX512_PCLMUL_INTRINSICS
 # ---------------------------
 # Check if the compiler supports AVX-512 carryless multiplication
diff --git a/configure b/configure
index 22cd866147b..209849c773c 100755
--- a/configure
+++ b/configure
@@ -17562,6 +17562,58 @@ $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
 
 fi
 
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
+$as_echo_n "checking for AVX2 support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+    #include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    #endif
+    static int avx2_test(void)
+    {
+      const char buf[sizeof(__m256i)];
+      __m256i accum = _mm256_loadu_si256((const __m256i *) buf);
+	  accum = _mm256_add_epi32(accum, accum);
+      int result = _mm256_extract_epi32(accum, 0);
+      return (int) result;
+    }
+int
+main ()
+{
+return avx2_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx2_support=yes
+else
+  pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+
+  if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for AVX-512 popcount intrinsics
 #
 if test x"$host_cpu" = x"x86_64"; then
diff --git a/configure.ac b/configure.ac
index e44943aa6fe..ca7205d90ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2084,6 +2084,15 @@ else
   fi
 fi
 
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_AVX2_SUPPORT()
+  if test x"$pgac_avx2_support" = x"yes"; then
+    AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+  fi
+fi
+
 # Check for XSAVE intrinsics
 #
 PGAC_XSAVE_INTRINSICS()
diff --git a/meson.build b/meson.build
index 395416a6060..5670722944e 100644
--- a/meson.build
+++ b/meson.build
@@ -2293,6 +2293,36 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of AVX2 support
+###############################################################
+
+if host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx2")))
+#endif
+static int avx2_test(void)
+{
+    return 0;
+}
+
+int main(void)
+{
+    return avx2_test();
+}
+'''
+
+  if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+    cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Check for the availability of AVX-512 popcount intrinsics.
 ###############################################################
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c4dc5d72bdb..987f9b5c77c 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -675,6 +675,9 @@
 /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
 #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
diff --git a/src/include/port/checksum_impl.h b/src/include/port/checksum_impl.h
index 00cb0549f24..0e1eef45249 100644
--- a/src/include/port/checksum_impl.h
+++ b/src/include/port/checksum_impl.h
@@ -100,8 +100,23 @@
  * manually unroll the inner loop.
  */
 
+#include "pg_config.h"
 #include "storage/bufpage.h"
 
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#include <immintrin.h>
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+#endif
+
 /* number of checksums to calculate in parallel */
 #define N_SUMS 32
 /* prime multiplier of FNV-1a hash */
@@ -114,6 +129,9 @@ typedef union
 	uint32		data[BLCKSZ / (sizeof(uint32) * N_SUMS)][N_SUMS];
 } PGChecksummablePage;
 
+/* Forward declaration */
+static uint32 pg_checksum_block_choose(const PGChecksummablePage *page);
+
 /*
  * Base offsets to initialize each of the parallel FNV hashes into a
  * different initial state.
@@ -129,6 +147,71 @@ static const uint32 checksumBaseOffsets[N_SUMS] = {
 	0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756
 };
 
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+
+/*
+ * Does CPUID say there's support for XSAVE instructions?
+ */
+static inline bool
+xsave_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+	return (exx[2] & (1 << 27)) != 0;	/* osxsave */
+}
+
+/*
+ * Does XGETBV say the YMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that xsave_available() returns true
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
+static inline bool
+ymm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+	return (_xgetbv(0) & 0x06) == 0x06;
+#else
+	return false;
+#endif
+}
+
+/*
+ * Check for AVX2 support using manual CPUID detection
+ */
+static inline bool
+avx2_available(void)
+{
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	if (!xsave_available() || !ymm_regs_available())
+		return false;
+
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	__cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+	return (exx[1] & (1 << 5)) != 0; /* avx2 */
+#else
+	return false;
+#endif
+}
+#endif /* USE_AVX2_WITH_RUNTIME_CHECK */
+
 /*
  * Calculate one round of the checksum.
  */
@@ -143,7 +226,7 @@ do { \
  * (at least on 4-byte boundary).
  */
 static uint32
-pg_checksum_block(const PGChecksummablePage *page)
+pg_checksum_block_default(const PGChecksummablePage *page)
 {
 	uint32		sums[N_SUMS];
 	uint32		result = 0;
@@ -173,6 +256,63 @@ pg_checksum_block(const PGChecksummablePage *page)
 	return result;
 }
 
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+/*
+ * AVX2-optimized block checksum algorithm.
+ * Same algorithm as default, but compiled with AVX2 target for auto-vectorization.
+ */
+pg_attribute_target("avx2")
+static uint32
+pg_checksum_block_avx2(const PGChecksummablePage *page)
+{
+	uint32		sums[N_SUMS];
+	uint32		result = 0;
+	uint32		i,
+				j;
+
+	/* ensure that the size is compatible with the algorithm */
+	Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+
+	/* initialize partial checksums to their corresponding offsets */
+	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+	/* main checksum calculation */
+	for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
+		for (j = 0; j < N_SUMS; j++)
+			CHECKSUM_COMP(sums[j], page->data[i][j]);
+
+	/* finally add in two rounds of zeroes for additional mixing */
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < N_SUMS; j++)
+			CHECKSUM_COMP(sums[j], 0);
+
+	/* xor fold partial checksums together */
+	for (i = 0; i < N_SUMS; i++)
+		result ^= sums[i];
+
+	return result;
+}
+#endif
+
+/* Function pointer - external linkage */
+static uint32 (*pg_checksum_block)(const PGChecksummablePage *page) = pg_checksum_block_choose;
+
+/* Choose the best available checksum implementation */
+static uint32
+pg_checksum_block_choose(const PGChecksummablePage *page)
+{
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+	if (avx2_available())
+	{
+		pg_checksum_block = pg_checksum_block_avx2;
+		return pg_checksum_block(page);
+	}
+#endif
+	/* fallback to default implementation */
+	pg_checksum_block = pg_checksum_block_default;
+	return pg_checksum_block(page);
+}
+
 /*
  * Compute the checksum for a Postgres page.
  *
-- 
2.43.0

