Attached is v11 to fix headerscheck, per CI. -- John Naylor Amazon Web Services
From 5f60e4457fa6e67b2d186895a4f3e10ac87989ec Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Thu, 8 Jan 2026 18:30:20 +0700 Subject: [PATCH v11 3/3] Enable autovectorizing pg_checksum_block with AVX2 runtime detection
We already rely on autovectorization for computing page checksums, but on x86 we can get about twice the performance by annotating pg_checksum_block() with function target attributes for AVX2, which uses 256-bit registers. WIP: Runtime detection is okay checksum.c for now, but it'd be better to refactor feature detection at some point so it's more centralized. Co-authored-by: Matthew Sterrett <[email protected]> Co-authored-by: Andrew Kim <[email protected]> Reviewed-by: Oleg Tselebrovskiy <[email protected]> Discussion: https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal --- config/c-compiler.m4 | 26 ++++ configure | 52 ++++++++ configure.ac | 9 ++ meson.build | 30 +++++ src/backend/storage/page/checksum.c | 112 +++++++++++++++++- src/include/pg_config.h.in | 3 + src/include/storage/checksum_block_internal.h | 42 +++++++ src/include/storage/checksum_impl.h | 48 +++----- src/tools/pginclude/headerscheck | 2 + 9 files changed, 290 insertions(+), 34 deletions(-) create mode 100644 src/include/storage/checksum_block_internal.h diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 1509dbfa2ab..1f3e31fc2d3 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -613,6 +613,32 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX2_SUPPORT +# --------------------------- +# Check if the compiler supports AVX2 target attribute. +# This is used for optimized checksum calculations with runtime detection. +# +# If AVX2 target attribute is supported, sets pgac_avx2_support. +AC_DEFUN([PGAC_AVX2_SUPPORT], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl +AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar], +[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + } + #endif], + [return avx2_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx2_support=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX2_SUPPORT + # PGAC_AVX512_PCLMUL_INTRINSICS # --------------------------- # Check if the compiler supports AVX-512 carryless multiplication diff --git a/configure b/configure index 04eeb1a741c..72c935c5d83 100755 --- a/configure +++ b/configure @@ -17680,6 +17680,58 @@ $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h fi +# Check for AVX2 target and intrinsic support +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5 +$as_echo_n "checking for AVX2 support... " >&6; } +if ${pgac_cv_avx2_support+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> + #include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx2"))) + #endif + static int avx2_test(void) + { + const char buf[sizeof(__m256i)]; + __m256i accum = _mm256_loadu_si256((const __m256i *) buf); + accum = _mm256_add_epi32(accum, accum); + int result = _mm256_extract_epi32(accum, 0); + return (int) result; + } +int +main () +{ +return avx2_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx2_support=yes +else + pgac_cv_avx2_support=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5 +$as_echo "$pgac_cv_avx2_support" >&6; } +if test x"$pgac_cv_avx2_support" = x"yes"; then + pgac_avx2_support=yes +fi + + if test x"$pgac_avx2_support" = x"yes"; then + +$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for AVX-512 popcount intrinsics # if test x"$host_cpu" = x"x86_64"; then diff --git a/configure.ac b/configure.ac index 13c75170f7a..c2180111044 100644 --- a/configure.ac +++ b/configure.ac @@ -2089,6 +2089,15 @@ else fi fi +# Check for AVX2 target and intrinsic support +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX2_SUPPORT() + if test x"$pgac_avx2_support" = x"yes"; then + AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.]) + fi +fi + # Check for XSAVE intrinsics # PGAC_XSAVE_INTRINSICS() diff --git a/meson.build b/meson.build index 6d304f32fb0..61620cbe37a 100644 --- a/meson.build +++ b/meson.build @@ -2348,6 +2348,36 @@ int main(void) endif +############################################################### +# Check for the availability of AVX2 support +############################################################### + +if host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> +#include <stdint.h> +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("avx2"))) +#endif +static int avx2_test(void) +{ + return 0; +} + +int main(void) +{ + return avx2_test(); +} +''' + + if cc.links(prog, name: 'AVX2 support', args: test_c_args) + cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Check for the availability of AVX-512 popcount intrinsics. ############################################################### diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index 8716651c8b5..55ebe988411 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -13,10 +13,120 @@ */ #include "postgres.h" +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) +#include <cpuid.h> +#endif + +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) +#include <intrin.h> +#endif + +#ifdef HAVE_XSAVE_INTRINSICS +#include <immintrin.h> +#endif + #include "storage/checksum.h" + /* * The actual code is in storage/checksum_impl.h. This is done so that * external programs can incorporate the checksum code by #include'ing - * that file from the exported Postgres headers. (Compare our CRC code.) + * that file from the exported Postgres headers. (Compare our legacy + * CRC code in pg_crc.h.) + * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific + * coding without affecting external programs. */ +#define PG_CHECKSUM_INTERNAL #include "storage/checksum_impl.h" /* IWYU pragma: keep */ + + +/* WIP: the feature detection should go in src/port */ + +/* + * Does CPUID say there's support for XSAVE instructions? + */ +static inline bool +xsave_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#endif + return (exx[2] & (1 << 27)) != 0; /* osxsave */ +} + +/* + * Does XGETBV say the YMM registers are enabled? + * + * NB: Caller is responsible for verifying that xsave_available() returns true + * before calling this. + */ +#ifdef HAVE_XSAVE_INTRINSICS +pg_attribute_target("xsave") +#endif +static inline bool +ymm_regs_available(void) +{ +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0x06) == 0x06; +#else + return false; +#endif +} + +/* + * Check for AVX2 support using CPUID detection + */ +static inline bool +avx2_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#endif + + return (exx[1] & (1 << 5)) != 0; /* avx2 */ +} + +static uint32 +pg_checksum_block_fallback(const PGChecksummablePage *page) +{ +#include "storage/checksum_block_internal.h" +} + +/* + * AVX2-optimized block checksum algorithm. + */ +#ifdef USE_AVX2_WITH_RUNTIME_CHECK +pg_attribute_target("avx2") +static uint32 +pg_checksum_block_avx2(const PGChecksummablePage *page) +{ +#include "storage/checksum_block_internal.h" +} +#endif /* USE_AVX2_WITH_RUNTIME_CHECK */ + +/* + * Choose the best available checksum implementation. + */ +static uint32 +pg_checksum_choose(const PGChecksummablePage *page) +{ +#ifdef USE_AVX2_WITH_RUNTIME_CHECK + if (xsave_available() && + ymm_regs_available() && + avx2_available()) + pg_checksum_block = pg_checksum_block_avx2; + else +#endif + pg_checksum_block = pg_checksum_block_fallback; + + return pg_checksum_block(page); +} + +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose; diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 339268dc8ef..1e43e9b2bc4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -665,6 +665,9 @@ /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */ #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use AVX2 instructions with a runtime check. */ +#undef USE_AVX2_WITH_RUNTIME_CHECK + /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK diff --git a/src/include/storage/checksum_block_internal.h b/src/include/storage/checksum_block_internal.h new file mode 100644 index 00000000000..b4e6987d6b5 --- /dev/null +++ b/src/include/storage/checksum_block_internal.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * checksum_block_internal.h + * Core algorithm for page checksums , semi private to checksum_impl.h + * and checksum.c. + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum_block_internal.h + * + *------------------------------------------------------------------------- + */ + +/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INTERNAL_H here */ + +uint32 sums[N_SUMS]; +uint32 result = 0; +uint32 i, + j; + +/* ensure that the size is compatible with the algorithm */ +Assert(sizeof(PGChecksummablePage) == BLCKSZ); + +/* initialize partial checksums to their corresponding offsets */ +memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + +/* main checksum calculation */ +for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], page->data[i][j]); + +/* finally add in two rounds of zeroes for additional mixing */ +for (i = 0; i < 2; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + +/* xor fold partial checksums together */ +for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + +return result; diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index 5c2dcbc63e7..8a308e423c3 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -73,11 +73,10 @@ * 2e-16 false positive rate within margin of error. * * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer - * multiplication instruction. As of 2013 the corresponding instruction is - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). - * Vectorization requires a compiler to do the vectorization for us. For recent - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough - * to achieve vectorization. + * multiplication instruction. Examples include x86 AVX2 extensions (vpmulld) + * and ARM NEON (vmul.i32). For simplicity we rely on the compiler to do the + * vectorization for us. For GCC and clang the flags -funroll-loops + * -ftree-vectorize are enough to achieve vectorization. * * The optimal amount of parallelism to use depends on CPU specific instruction * latency, SIMD instruction width, throughput and the amount of registers @@ -89,8 +88,9 @@ * * The parallelism number 32 was chosen based on the fact that it is the * largest state that fits into architecturally visible x86 SSE registers while - * leaving some free registers for intermediate values. For future processors - * with 256bit vector registers this will leave some performance on the table. + * leaving some free registers for intermediate values. For processors + * with 256bit vector registers this leaves some performance on the table. + * * When vectorization is not available it might be beneficial to restructure * the computation to calculate a subset of the columns at a time and perform * multiple passes to avoid register spilling. This optimization opportunity @@ -138,6 +138,9 @@ do { \ (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ } while (0) +/* Provide a static definition for external programs */ +#ifndef PG_CHECKSUM_INTERNAL + /* * Block checksum algorithm. The page must be adequately aligned * (at least on 4-byte boundary). @@ -145,34 +148,13 @@ do { \ static uint32 pg_checksum_block(const PGChecksummablePage *page) { - uint32 sums[N_SUMS]; - uint32 result = 0; - uint32 i, - j; - - /* ensure that the size is compatible with the algorithm */ - Assert(sizeof(PGChecksummablePage) == BLCKSZ); - - /* initialize partial checksums to their corresponding offsets */ - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); - - /* main checksum calculation */ - for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], page->data[i][j]); - - /* finally add in two rounds of zeroes for additional mixing */ - for (i = 0; i < 2; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], 0); - - /* xor fold partial checksums together */ - for (i = 0; i < N_SUMS; i++) - result ^= sums[i]; - - return result; +#include "storage/checksum_block_internal.h" } +#else +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page); +#endif + /* * Compute the checksum for a Postgres page. * diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 7a6755991bb..569e749b25a 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -154,6 +154,8 @@ do test "$f" = src/include/catalog/syscache_ids.h && continue test "$f" = src/include/catalog/syscache_info.h && continue + test "$f" = src/include/storage/checksum_block_internal.h && continue + # We can't make these Bison output files compilable standalone # without using "%code require", which old Bison versions lack. # parser/gram.h will be included by parser/gramparse.h anyway. -- 2.52.0
From b6a3aba8f9568684cf22af42584bec65b6170668 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Fri, 9 Jan 2026 17:07:37 +0700 Subject: [PATCH v11 2/3] Adjust benchmark to use core checksum XXX not for commit --- contrib/pg_checksum_bench/pg_checksum_bench.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.c b/contrib/pg_checksum_bench/pg_checksum_bench.c index dc20395a590..61da664e723 100644 --- a/contrib/pg_checksum_bench/pg_checksum_bench.c +++ b/contrib/pg_checksum_bench/pg_checksum_bench.c @@ -1,7 +1,6 @@ #include "postgres.h" #include "fmgr.h" -#include "port/checksum.h" -#include "port/checksum_impl.h" +#include "storage/checksum.h" #include <stdio.h> #include <assert.h> @@ -15,23 +14,23 @@ Datum drive_pg_checksum(PG_FUNCTION_ARGS) { int page_count = PG_GETARG_INT32(0); - PGChecksummablePage *pages; + char *pages; int i; size_t j; - pages = palloc(page_count * sizeof(PGChecksummablePage)); + pages = palloc(page_count * BLCKSZ); srand(0); - for (j = 0; j < page_count * sizeof(PGChecksummablePage); j++) + for (j = 0; j < page_count * BLCKSZ; j++) { - char *byte_ptr = (char *) pages; + char *byte_ptr = pages; byte_ptr[j] = rand() % 256; } for (i = 0; i < REPEATS; i++) { - const PGChecksummablePage *test_page = pages + (i % page_count); - volatile uint32 result = pg_checksum_block_choose((const char *) test_page); + char *test_page = pages + (i % page_count); + volatile uint32 result = pg_checksum_page((char *) test_page, 0); (void) result; } -- 2.52.0
From 97a24b6da8fddaaafc2ed434dabf14a53bd6eecb Mon Sep 17 00:00:00 2001 From: Andrew Kim <[email protected]> Date: Wed, 5 Nov 2025 14:37:29 -0800 Subject: [PATCH v11 1/3] Benchmark code for postgres checksums Add pg_checksum_bench extension for performance testing of checksum implementations with AVX2 optimization. XXX not for commit --- contrib/meson.build | 1 + contrib/pg_checksum_bench/meson.build | 23 ++++++++++ .../pg_checksum_bench--1.0.sql | 8 ++++ contrib/pg_checksum_bench/pg_checksum_bench.c | 42 +++++++++++++++++++ .../pg_checksum_bench.control | 4 ++ .../sql/pg_checksum_bench.sql | 17 ++++++++ 6 files changed, 95 insertions(+) create mode 100644 contrib/pg_checksum_bench/meson.build create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench.c create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench.control create mode 100644 contrib/pg_checksum_bench/sql/pg_checksum_bench.sql diff --git a/contrib/meson.build b/contrib/meson.build index def13257cbe..98fe47b5b9b 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -12,6 +12,7 @@ contrib_doc_args = { 'install_dir': contrib_doc_dir, } +subdir('pg_checksum_bench') subdir('amcheck') subdir('auth_delay') subdir('auto_explain') diff --git a/contrib/pg_checksum_bench/meson.build b/contrib/pg_checksum_bench/meson.build new file mode 100644 index 00000000000..32ccd9efa0f --- /dev/null +++ b/contrib/pg_checksum_bench/meson.build @@ -0,0 +1,23 @@ +# Copyright (c) 2022-2025, PostgreSQL Global Development Group + +pg_checksum_bench_sources = files( + 'pg_checksum_bench.c', +) + +if host_system == 'windows' + pg_checksum_bench_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'pg_checksum_bench', + '--FILEDESC', 'pg_checksum_bench',]) +endif + +pg_checksum_bench = shared_module('pg_checksum_bench', + pg_checksum_bench_sources, + kwargs: contrib_mod_args, +) +contrib_targets += pg_checksum_bench + +install_data( + 'pg_checksum_bench--1.0.sql', + 'pg_checksum_bench.control', + kwargs: contrib_data_args, +) diff --git a/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql b/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql new file mode 100644 index 00000000000..5f13cbe3c5e --- /dev/null +++ b/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql @@ -0,0 +1,8 @@ +/* contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +-- \echo Use "CREATE EXTENSION pg_checksum_bench" to load this file. \quit + +CREATE FUNCTION drive_pg_checksum(page_count int) + RETURNS pg_catalog.void + AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.c b/contrib/pg_checksum_bench/pg_checksum_bench.c new file mode 100644 index 00000000000..dc20395a590 --- /dev/null +++ b/contrib/pg_checksum_bench/pg_checksum_bench.c @@ -0,0 +1,42 @@ +#include "postgres.h" +#include "fmgr.h" +#include "port/checksum.h" +#include "port/checksum_impl.h" + +#include <stdio.h> +#include <assert.h> + +PG_MODULE_MAGIC; + +#define REPEATS 1000000 + +PG_FUNCTION_INFO_V1(drive_pg_checksum); +Datum +drive_pg_checksum(PG_FUNCTION_ARGS) +{ + int page_count = PG_GETARG_INT32(0); + PGChecksummablePage *pages; + int i; + size_t j; + + pages = palloc(page_count * sizeof(PGChecksummablePage)); + srand(0); + for (j = 0; j < page_count * sizeof(PGChecksummablePage); j++) + { + char *byte_ptr = (char *) pages; + + byte_ptr[j] = rand() % 256; + } + + for (i = 0; i < REPEATS; i++) + { + const PGChecksummablePage *test_page = pages + (i % page_count); + volatile uint32 result = pg_checksum_block_choose((const char *) test_page); + + (void) result; + } + + pfree((void *) pages); + + PG_RETURN_VOID(); +} diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.control b/contrib/pg_checksum_bench/pg_checksum_bench.control new file mode 100644 index 00000000000..4a4e2c9363c --- /dev/null +++ b/contrib/pg_checksum_bench/pg_checksum_bench.control @@ -0,0 +1,4 @@ +comment = 'pg_checksum benchmark' +default_version = '1.0' +module_pathname = '$libdir/pg_checksum_bench' +relocatable = true diff --git a/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql b/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql new file mode 100644 index 00000000000..4b347699953 --- /dev/null +++ b/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql @@ -0,0 +1,17 @@ +CREATE EXTENSION pg_checksum_bench; + +SELECT drive_pg_checksum(-1); + +\timing on + +SELECT drive_pg_checksum(1); +SELECT drive_pg_checksum(2); +SELECT drive_pg_checksum(4); +SELECT drive_pg_checksum(8); +SELECT drive_pg_checksum(16); +SELECT drive_pg_checksum(32); +SELECT drive_pg_checksum(64); +SELECT drive_pg_checksum(128); +SELECT drive_pg_checksum(256); +SELECT drive_pg_checksum(512); +SELECT drive_pg_checksum(1024); -- 2.52.0
