On Thu, Feb 19, 2026 at 1:47 AM Zsolt Parragi <[email protected]> wrote: > > > Done. I haven't tried Arm support yet, but now I realize the header > > should be named generically, so it's now "pg_cpu.h". Then it can be > > included everywhere. > > That makes sense, and simplifies the usage of the header. (However, > the include guard still refers to the old name)
Oops, fixed. > > I don't know. The instruction family names are conventionally all in > > caps, but this is just our signal that we've populated the array. That > > said, a less generic name would be better for grep-ability. > > Yes, that could work too. But reserving the lowercase "init" symbol in > a very generic header seems like a bad idea (especially for a use case > that isn't used globally), even if Postgres itself doesn't use the > symbol for anything else. "INIT" at least would be unlikely to > conflict with something else. "INIT" still seems pretty generic, so I went with INIT_PG_X86. I've also made a quick attempt at Arm support just to make sure I didn't paint myself into a corner (v4-0005-6), and it compiles and passes tests on a Debian aarch64 system with gcc 8.3. I'll put that aside for later. v4-0001-3 are still the main focus now, and seem in decent shape, though they may need a bit more polish (not to mention formal commit messages). -- John Naylor Amazon Web Services
From 480fc1ad8ee8e0c5cecfd0ea7348b877feb77944 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Thu, 12 Feb 2026 12:45:23 +0700 Subject: [PATCH v4 2/6] Centralize detection of CPU features WIP: x86 only --- src/include/port/pg_cpu.h | 50 +++++++++++++++++++++ src/port/pg_cpu_x86.c | 62 +++++++++++--------------- src/port/pg_crc32c_sse42.c | 28 ++++++++++++ src/port/pg_popcount_x86.c | 91 ++------------------------------------ 4 files changed, 107 insertions(+), 124 deletions(-) create mode 100644 src/include/port/pg_cpu.h diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h new file mode 100644 index 00000000000..c9e03d016e9 --- /dev/null +++ b/src/include/port/pg_cpu.h @@ -0,0 +1,50 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu.h + * Runtime CPU feature detection + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/port/pg_cpu.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_CPU_H +#define PG_CPU_H + +#if defined(USE_SSE2) || defined(__i386__) + +typedef enum X86FeatureId +{ + /* Have we run feature detection? 
*/ + INIT_PG_X86, + + /* scalar and 128-bit registers */ + PG_SSE4_2, + PG_POPCNT, + + /* 512-bit registers */ + PG_AVX512_BW, + PG_AVX512_VL, + PG_AVX512_VPCLMULQDQ, + PG_AVX512_VPOPCNTDQ, +} X86FeatureId; +#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1) + +extern PGDLLIMPORT bool X86Features[]; + +extern void set_x86_features(void); + +static inline bool +x86_feature_available(X86FeatureId feature) +{ + if (X86Features[INIT_PG_X86] == false) + set_x86_features(); + + return X86Features[feature]; +} + +#endif /* defined(USE_SSE2) || defined(__i386__) */ + +#endif /* PG_CPU_H */ diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 998a70ffa41..725bd1f68c8 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -1,12 +1,7 @@ /*------------------------------------------------------------------------- * * pg_cpu_x86.c - * Choose between Intel SSE 4.2 and software CRC-32C implementation. - * - * On first call, checks if the CPU we're running on supports Intel SSE - * 4.2. If it does, use the special SSE instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation - * (slicing-by-8). + * Runtime CPU feature detection for x86 * * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -34,7 +29,10 @@ #include <immintrin.h> #endif -#include "port/pg_crc32c.h" +#include "port/pg_cpu.h" + + +bool X86Features[X86FeaturesSize] = {0}; /* * Does XGETBV say the ZMM registers are enabled? @@ -56,22 +54,13 @@ zmm_regs_available(void) } /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. + * Parse the CPU ID info for runtime checks. */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +void +set_x86_features(void) { unsigned int exx[4] = {0, 0, 0, 0}; - /* - * Set fallback. 
We must guard since slicing-by-8 is not visible - * everywhere. - */ -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK - pg_comp_crc32c = pg_comp_crc32c_sb8; -#endif - #if defined(HAVE__GET_CPUID) __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUID) @@ -80,34 +69,33 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) #error cpuid instruction not available #endif - if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ - { - pg_comp_crc32c = pg_comp_crc32c_sse42; + X86Features[PG_SSE4_2] = exx[2] >> 20 & 1; + X86Features[PG_POPCNT] = exx[2] >> 23 & 1; - if (exx[2] & (1 << 27) && /* OSXSAVE */ - zmm_regs_available()) - { - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + /* All these features depend on OSXSAVE */ + if (exx[2] & (1 << 27)) + { + /* second cpuid call on leaf 7 to check extended AVX-512 support */ - memset(exx, 0, 4 * sizeof(exx[0])); + memset(exx, 0, 4 * sizeof(exx[0])); #if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); + __cpuidex(exx, 7, 0); #endif -#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK - if (exx[2] & (1 << 10) && /* VPCLMULQDQ */ - exx[1] & (1 << 31)) /* AVX512-VL */ - pg_comp_crc32c = pg_comp_crc32c_avx512; -#endif + if (zmm_regs_available()) + { + X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1; + X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1; + + X86Features[PG_VPCLMULQDQ] = exx[2] >> 10 & 1; + X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1; } } - return pg_comp_crc32c(crc, data, len); + X86Features[INIT_PG_X86] = true; } -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; - #endif /* defined(USE_SSE2) || defined(__i386__) */ diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index c1279d31fbd..2e740e12a7a 100644 --- a/src/port/pg_crc32c_sse42.c +++ 
b/src/port/pg_crc32c_sse42.c @@ -20,6 +20,9 @@ #endif #include "port/pg_crc32c.h" +#include "port/pg_cpu.h" + +static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len); pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") @@ -159,3 +162,28 @@ pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len) } #endif + +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +{ + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. + */ +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + + if (x86_feature_available(PG_SSE4_2)) + pg_comp_crc32c = pg_comp_crc32c_sse42; + +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + if (x86_feature_available(PG_AVX512_VL) && + x86_feature_available(PG_VPCLMULQDQ)) + pg_comp_crc32c = pg_comp_crc32c_avx512; +#endif + + return pg_comp_crc32c(crc, data, len); +}; + +pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 6bce089432f..a99613f1818 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -14,19 +14,12 @@ #ifdef HAVE_X86_64_POPCNTQ -#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) -#include <cpuid.h> -#endif - #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK #include <immintrin.h> #endif -#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) -#include <intrin.h> -#endif - #include "port/pg_bitutils.h" +#include "port/pg_cpu.h" /* * The SSE4.2 versions are built regardless of whether we are building the @@ -58,84 +51,9 @@ static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; -/* - * Return true if CPUID indicates that the POPCNT 
instruction is available. - */ -static bool -pg_popcount_sse42_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - - return (exx[2] & (1 << 23)) != 0; /* POPCNT */ -} #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK -/* - * Does CPUID say there's support for XSAVE instructions? - */ -static inline bool -xsave_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 27)) != 0; /* osxsave */ -} - -/* - * Does XGETBV say the ZMM registers are enabled? - * - * NB: Caller is responsible for verifying that xsave_available() returns true - * before calling this. - */ -#ifdef HAVE_XSAVE_INTRINSICS -pg_attribute_target("xsave") -#endif -static inline bool -zmm_regs_available(void) -{ -#ifdef HAVE_XSAVE_INTRINSICS - return (_xgetbv(0) & 0xe6) == 0xe6; -#else - return false; -#endif -} - -/* - * Does CPUID say there's support for AVX-512 popcount and byte-and-word - * instructions? - */ -static inline bool -avx512_popcnt_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */ - (exx[1] & (1 << 30)) != 0; /* avx512-bw */ -} - /* * Returns true if the CPU supports the instructions required for the AVX-512 * pg_popcount() implementation. 
@@ -143,9 +61,8 @@ avx512_popcnt_available(void) static bool pg_popcount_avx512_available(void) { - return xsave_available() && - zmm_regs_available() && - avx512_popcnt_available(); + return x86_feature_available(PG_AVX512_BW) && + x86_feature_available(PG_AVX512_VPOPCNTDQ); } #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ @@ -159,7 +76,7 @@ pg_popcount_avx512_available(void) static inline void choose_popcount_functions(void) { - if (pg_popcount_sse42_available()) + if (x86_feature_available(PG_POPCNT)) { pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; -- 2.53.0
From 7ea446c6cf79c7f5924b601b9e10d0de2c069c14 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Wed, 11 Feb 2026 14:34:18 +0700 Subject: [PATCH v4 1/6] Rename CRC "choose" files for future general purpose WIP: x86 only --- configure | 4 ++-- configure.ac | 4 ++-- src/port/Makefile | 1 + src/port/meson.build | 3 +-- src/port/{pg_crc32c_sse42_choose.c => pg_cpu_x86.c} | 8 ++++++-- 5 files changed, 12 insertions(+), 8 deletions(-) rename src/port/{pg_crc32c_sse42_choose.c => pg_cpu_x86.c} (94%) diff --git a/configure b/configure index a10a2c85c6a..185703289b4 100755 --- a/configure +++ b/configure @@ -18196,7 +18196,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else @@ -18204,7 +18204,7 @@ else $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 $as_echo "SSE 4.2 with runtime check" >&6; } else diff --git a/configure.ac b/configure.ac index 814e64a967e..0955b7e4371 100644 --- a/configure.ac +++ b/configure.ac @@ -2245,12 +2245,12 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o 
pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o" AC_MSG_RESULT(SSE 4.2 with runtime check) else if test x"$USE_ARMV8_CRC32C" = x"1"; then diff --git a/src/port/Makefile b/src/port/Makefile index 6e3b7d154ed..47cfea1507d 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_cpu_x86.o \ pg_localeconv_r.o \ pg_numa.o \ pg_popcount_aarch64.o \ diff --git a/src/port/meson.build b/src/port/meson.build index d7d4e705b89..edb2e5632bd 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_x86.c', 'pg_localeconv_r.c', 'pg_numa.c', 'pg_popcount_aarch64.c', @@ -86,8 +87,6 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'], - ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], # arm / aarch64 diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_cpu_x86.c similarity index 94% rename from src/port/pg_crc32c_sse42_choose.c rename to src/port/pg_cpu_x86.c index f586476964f..998a70ffa41 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_cpu_x86.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * pg_crc32c_sse42_choose.c + * pg_cpu_x86.c * Choose between Intel SSE 4.2 and software CRC-32C implementation. 
* * On first call, checks if the CPU we're running on supports Intel SSE @@ -13,13 +13,15 @@ * * * IDENTIFICATION - * src/port/pg_crc32c_sse42_choose.c + * src/port/pg_cpu_x86.c * *------------------------------------------------------------------------- */ #include "c.h" +#if defined(USE_SSE2) || defined(__i386__) + #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include <cpuid.h> #endif @@ -107,3 +109,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) } pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; + +#endif /* defined(USE_SSE2) || defined(__i386__) */ -- 2.53.0
From 03e6fdd45f4a9bc1bd045feff2ee395c45cc9a88 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Fri, 13 Feb 2026 18:11:39 +0700 Subject: [PATCH v4 3/6] Refactor the detection of ZMM registers - Call _xgetbv within set_x86_features rather than in a separate function - Use symbols for XCR mask bits rather than a magic constant A future commit will build on this to detect YMM registers without code duplication. --- src/port/pg_cpu_x86.c | 42 +++++++++++++++++++++----------------- src/port/pg_crc32c_sse42.c | 2 +- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 725bd1f68c8..9b462b186b8 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -31,31 +31,28 @@ #include "port/pg_cpu.h" +/* XSAVE state component bits that we need */ +#define XMM (1<<1) +#define YMM (1<<2) +#define OPMASK (1<<5) +#define ZMM0_15 (1<<6) +#define ZMM16_31 (1<<7) + bool X86Features[X86FeaturesSize] = {0}; -/* - * Does XGETBV say the ZMM registers are enabled? - * - * NB: Caller is responsible for verifying that osxsave is available - * before calling this. - */ -#ifdef HAVE_XSAVE_INTRINSICS -pg_attribute_target("xsave") -#endif static bool -zmm_regs_available(void) +mask_available(uint32 value, uint32 mask) { -#ifdef HAVE_XSAVE_INTRINSICS - return (_xgetbv(0) & 0xe6) == 0xe6; -#else - return false; -#endif + return (value & mask) == mask; } /* * Parse the CPU ID info for runtime checks. 
*/ +#ifdef HAVE_XSAVE_INTRINSICS +pg_attribute_target("xsave") +#endif void set_x86_features(void) { @@ -75,22 +72,29 @@ set_x86_features(void) /* All these features depend on OSXSAVE */ if (exx[2] & (1 << 27)) { - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + uint32 xcr0_val = 0; + /* second cpuid call on leaf 7 to check extended AVX-512 support */ memset(exx, 0, 4 * sizeof(exx[0])); - #if defined(HAVE__GET_CPUID_COUNT) __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUIDEX) __cpuidex(exx, 7, 0); #endif - if (zmm_regs_available()) +#ifdef HAVE_XSAVE_INTRINSICS + /* get value of Extended Control Register */ + xcr0_val = _xgetbv(0); +#endif + + /* Are ZMM registeres enabled? */ + if (mask_available(xcr0_val, XMM | YMM | + OPMASK | ZMM0_15 | ZMM16_31)) { X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1; X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1; - X86Features[PG_VPCLMULQDQ] = exx[2] >> 10 & 1; + X86Features[PG_AVX512_VPCLMULQDQ] = exx[2] >> 10 & 1; X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1; } } diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 2e740e12a7a..d1d9d74e5ab 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -179,7 +179,7 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK if (x86_feature_available(PG_AVX512_VL) && - x86_feature_available(PG_VPCLMULQDQ)) + x86_feature_available(PG_AVX512_VPCLMULQDQ)) pg_comp_crc32c = pg_comp_crc32c_avx512; #endif -- 2.53.0
From 3d186bc8be218c609ff9005d88ea6222b3af4445 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Sat, 14 Feb 2026 19:01:34 +0700 Subject: [PATCH v4 4/6] Enable autovectorizing page checksums with AVX2 where available We already rely on autovectorization for computing page checksums, but on x86 we can get about twice the performance by annotating pg_checksum_block() with function target attributes for AVX2, which uses 256-bit registers. Co-authored-by: Matthew Sterrett <[email protected]> Co-authored-by: Andrew Kim <[email protected]> Reviewed-by: Oleg Tselebrovskiy <[email protected]> Discussion: https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal --- config/c-compiler.m4 | 26 ++++++++++ configure | 46 ++++++++++++++++++ configure.ac | 9 ++++ meson.build | 30 ++++++++++++ src/backend/storage/page/checksum.c | 44 ++++++++++++++++- src/include/pg_config.h.in | 3 ++ src/include/port/pg_cpu.h | 3 ++ src/include/storage/checksum_block_internal.h | 42 ++++++++++++++++ src/include/storage/checksum_impl.h | 48 ++++++------------- src/port/pg_cpu_x86.c | 6 ++- src/tools/pginclude/headerscheck | 2 + 11 files changed, 224 insertions(+), 35 deletions(-) create mode 100644 src/include/storage/checksum_block_internal.h diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 1509dbfa2ab..1f3e31fc2d3 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -613,6 +613,32 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX2_SUPPORT +# --------------------------- +# Check if the compiler supports AVX2 target attribute. +# This is used for optimized checksum calculations with runtime detection. +# +# If AVX2 target attribute is supported, sets pgac_avx2_support. 
+AC_DEFUN([PGAC_AVX2_SUPPORT], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl +AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar], +[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + } + #endif], + [return avx2_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx2_support=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX2_SUPPORT + # PGAC_AVX512_PCLMUL_INTRINSICS # --------------------------- # Check if the compiler supports AVX-512 carryless multiplication diff --git a/configure b/configure index 185703289b4..2d2c6308005 100755 --- a/configure +++ b/configure @@ -17718,6 +17718,52 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h fi fi +# Check for AVX2 target and intrinsic support +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5 +$as_echo_n "checking for AVX2 target attribute support... " >&6; } +if ${pgac_cv_avx2_support+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + } + #endif +int +main () +{ +return avx2_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_avx2_support=yes +else + pgac_cv_avx2_support=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5 +$as_echo "$pgac_cv_avx2_support" >&6; } +if test x"$pgac_cv_avx2_support" = x"yes"; then + pgac_avx2_support=yes +fi + + if test x"$pgac_avx2_support" = x"yes"; then + +$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for XSAVE intrinsics # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5 diff --git a/configure.ac b/configure.ac index 0955b7e4371..0b4c3970b68 100644 --- a/configure.ac +++ b/configure.ac @@ -2122,6 +2122,15 @@ else fi fi +# Check for AVX2 target and intrinsic support +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX2_SUPPORT() + if test x"$pgac_avx2_support" = x"yes"; then + AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.]) + fi +fi + # Check for XSAVE intrinsics # PGAC_XSAVE_INTRINSICS() diff --git a/meson.build b/meson.build index f6d5842d852..feea3658ff3 100644 --- a/meson.build +++ b/meson.build @@ -2377,6 +2377,36 @@ int main(void) endif +############################################################### +# Check for the availability of AVX2 support +############################################################### + +if host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> +#include <stdint.h> +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("avx2"))) +#endif +static int avx2_test(void) +{ + return 0; +} + +int main(void) +{ + return avx2_test(); +} +''' + + if cc.links(prog, name: 'AVX2 support', args: test_c_args) + 
cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Check for the availability of AVX-512 popcount intrinsics. ############################################################### diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index 8716651c8b5..030c44f7308 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -13,10 +13,52 @@ */ #include "postgres.h" +#include "port/pg_cpu.h" #include "storage/checksum.h" /* * The actual code is in storage/checksum_impl.h. This is done so that * external programs can incorporate the checksum code by #include'ing - * that file from the exported Postgres headers. (Compare our CRC code.) + * that file from the exported Postgres headers. (Compare our legacy + * CRC code in pg_crc.h.) + * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific + * coding without affecting external programs. */ +#define PG_CHECKSUM_INTERNAL #include "storage/checksum_impl.h" /* IWYU pragma: keep */ + + +static uint32 +pg_checksum_block_fallback(const PGChecksummablePage *page) +{ +#include "storage/checksum_block_internal.h" +} + +/* + * AVX2-optimized block checksum algorithm. + */ +#ifdef USE_AVX2_WITH_RUNTIME_CHECK +pg_attribute_target("avx2") +static uint32 +pg_checksum_block_avx2(const PGChecksummablePage *page) +{ +#include "storage/checksum_block_internal.h" +} +#endif /* USE_AVX2_WITH_RUNTIME_CHECK */ + +/* + * Choose the best available checksum implementation. 
+ */ +static uint32 +pg_checksum_choose(const PGChecksummablePage *page) +{ + pg_checksum_block = pg_checksum_block_fallback; + +#ifdef USE_AVX2_WITH_RUNTIME_CHECK + if (x86_feature_available(PG_AVX2)) + pg_checksum_block = pg_checksum_block_avx2; +#endif + + return pg_checksum_block(page); +} + +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose; diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 339268dc8ef..1e43e9b2bc4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -665,6 +665,9 @@ /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */ #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use AVX2 instructions with a runtime check. */ +#undef USE_AVX2_WITH_RUNTIME_CHECK + /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index c9e03d016e9..3c70fd43a23 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -24,6 +24,9 @@ typedef enum X86FeatureId PG_SSE4_2, PG_POPCNT, + /* 256-bit registers */ + PG_AVX2, + /* 512-bit registers */ PG_AVX512_BW, PG_AVX512_VL, diff --git a/src/include/storage/checksum_block_internal.h b/src/include/storage/checksum_block_internal.h new file mode 100644 index 00000000000..b4e6987d6b5 --- /dev/null +++ b/src/include/storage/checksum_block_internal.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * checksum_block_internal.h + * Core algorithm for page checksums , semi private to checksum_impl.h + * and checksum.c. 
+ * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum_block_internal.h + * + *------------------------------------------------------------------------- + */ + +/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INTERNAL_H here */ + +uint32 sums[N_SUMS]; +uint32 result = 0; +uint32 i, + j; + +/* ensure that the size is compatible with the algorithm */ +Assert(sizeof(PGChecksummablePage) == BLCKSZ); + +/* initialize partial checksums to their corresponding offsets */ +memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + +/* main checksum calculation */ +for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], page->data[i][j]); + +/* finally add in two rounds of zeroes for additional mixing */ +for (i = 0; i < 2; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + +/* xor fold partial checksums together */ +for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + +return result; diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index 5c2dcbc63e7..8a308e423c3 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -73,11 +73,10 @@ * 2e-16 false positive rate within margin of error. * * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer - * multiplication instruction. As of 2013 the corresponding instruction is - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). - * Vectorization requires a compiler to do the vectorization for us. For recent - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough - * to achieve vectorization. + * multiplication instruction. Examples include x86 AVX2 extensions (vpmulld) + * and ARM NEON (vmul.i32). For simplicity we rely on the compiler to do the + * vectorization for us. 
For GCC and clang the flags -funroll-loops + * -ftree-vectorize are enough to achieve vectorization. * * The optimal amount of parallelism to use depends on CPU specific instruction * latency, SIMD instruction width, throughput and the amount of registers @@ -89,8 +88,9 @@ * * The parallelism number 32 was chosen based on the fact that it is the * largest state that fits into architecturally visible x86 SSE registers while - * leaving some free registers for intermediate values. For future processors - * with 256bit vector registers this will leave some performance on the table. + * leaving some free registers for intermediate values. For processors + * with 256bit vector registers this leaves some performance on the table. + * * When vectorization is not available it might be beneficial to restructure * the computation to calculate a subset of the columns at a time and perform * multiple passes to avoid register spilling. This optimization opportunity @@ -138,6 +138,9 @@ do { \ (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ } while (0) +/* Provide a static definition for external programs */ +#ifndef PG_CHECKSUM_INTERNAL + /* * Block checksum algorithm. The page must be adequately aligned * (at least on 4-byte boundary). 
@@ -145,34 +148,13 @@ do { \ static uint32 pg_checksum_block(const PGChecksummablePage *page) { - uint32 sums[N_SUMS]; - uint32 result = 0; - uint32 i, - j; - - /* ensure that the size is compatible with the algorithm */ - Assert(sizeof(PGChecksummablePage) == BLCKSZ); - - /* initialize partial checksums to their corresponding offsets */ - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); - - /* main checksum calculation */ - for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], page->data[i][j]); - - /* finally add in two rounds of zeroes for additional mixing */ - for (i = 0; i < 2; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], 0); - - /* xor fold partial checksums together */ - for (i = 0; i < N_SUMS; i++) - result ^= sums[i]; - - return result; +#include "storage/checksum_block_internal.h" } +#else +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page); +#endif + /* * Compute the checksum for a Postgres page. * diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 9b462b186b8..a48c39ade98 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -74,7 +74,7 @@ set_x86_features(void) { uint32 xcr0_val = 0; - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + /* second cpuid call on leaf 7 to check extended support */ memset(exx, 0, 4 * sizeof(exx[0])); #if defined(HAVE__GET_CPUID_COUNT) __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); @@ -87,6 +87,10 @@ set_x86_features(void) xcr0_val = _xgetbv(0); #endif + /* Are YMM registers enabled? */ + if (mask_available(xcr0_val, XMM | YMM)) + X86Features[PG_AVX2] = exx[1] >> 5 & 1; + /* Are ZMM registeres enabled? 
*/ if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 7a6755991bb..569e749b25a 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -154,6 +154,8 @@ do test "$f" = src/include/catalog/syscache_ids.h && continue test "$f" = src/include/catalog/syscache_info.h && continue + test "$f" = src/include/storage/checksum_block_internal.h && continue + # We can't make these Bison output files compilable standalone # without using "%code require", which old Bison versions lack. # parser/gram.h will be included by parser/gramparse.h anyway. -- 2.53.0
From 58d1b495afc0791b6ab4dfaa421cee2bc09d5359 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Thu, 19 Feb 2026 15:04:35 +0700 Subject: [PATCH v4 5/6] Rename CRC Arm-v8 "choose" file for future general purpose --- configure | 2 +- configure.ac | 2 +- src/port/Makefile | 1 + src/port/meson.build | 1 + src/port/{pg_crc32c_armv8_choose.c => pg_cpu_armv8.c} | 8 ++++++-- 5 files changed, 10 insertions(+), 4 deletions(-) rename src/port/{pg_crc32c_armv8_choose.c => pg_cpu_armv8.c} (95%) diff --git a/configure b/configure index 2d2c6308005..ec76278f9c1 100755 --- a/configure +++ b/configure @@ -18266,7 +18266,7 @@ $as_echo "ARMv8 CRC instructions" >&6; } $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } else diff --git a/configure.ac b/configure.ac index 0b4c3970b68..5a3971bad63 100644 --- a/configure.ac +++ b/configure.ac @@ -2269,7 +2269,7 @@ else else if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o" AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else if test x"$USE_LOONGARCH_CRC32C" = x"1"; then diff --git a/src/port/Makefile b/src/port/Makefile index 47cfea1507d..4ed7e1902fb 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_cpu_armv8.o \ pg_cpu_x86.o \ pg_localeconv_r.o \ pg_numa.o \ diff --git a/src/port/meson.build b/src/port/meson.build index edb2e5632bd..0b26218be7a 100644 --- a/src/port/meson.build +++ 
b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_armv8.c', 'pg_cpu_x86.c', 'pg_localeconv_r.c', 'pg_numa.c', diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_cpu_armv8.c similarity index 95% rename from src/port/pg_crc32c_armv8_choose.c rename to src/port/pg_cpu_armv8.c index a1f0e540c6b..6c22704b5fa 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_cpu_armv8.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * - * pg_crc32c_armv8_choose.c + * pg_cpu_armv8.c * Choose between ARMv8 and software CRC-32C implementation. * * On first call, checks if the CPU we're running on supports the ARMv8 @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * src/port/pg_crc32c_armv8_choose.c + * src/port/pg_cpu_armv8.c * *------------------------------------------------------------------------- */ @@ -24,6 +24,8 @@ #include "postgres_fe.h" #endif +#if defined(__arm__) || defined(__arm) || defined(__aarch64__) + #if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL) #include <sys/auxv.h> /* Ancient glibc releases don't include the HWCAPxxx macros in sys/auxv.h */ @@ -124,3 +126,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) } pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; + +#endif /* __arm__ || __arm || __aarch64__ */ -- 2.53.0
From a3e9a1302d710f9fa6c48594144a82fe0c3988d6 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Thu, 19 Feb 2026 18:27:02 +0700 Subject: [PATCH v4 6/6] Centralize detection of CPU features Arm, take 1 --- src/include/port/pg_cpu.h | 25 ++++++++++++ src/port/pg_cpu_armv8.c | 74 +++++++++++++++------------------- src/port/pg_crc32c_armv8.c | 20 +++++++++ src/port/pg_popcount_aarch64.c | 26 +----------- 4 files changed, 80 insertions(+), 65 deletions(-) diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index 3c70fd43a23..3687e025083 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -48,6 +48,31 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +#elif defined(__arm__) || defined(__arm) || defined(__aarch64__) + +typedef enum ArmFeatureId +{ + /* Have we run feature detection? */ + INIT_PG_ARM, + + PG_ARM_CRC32, + PG_ARM_SVE, +} ArmFeatureId; +#define ArmFeaturesSize (PG_ARM_SVE + 1) + +extern PGDLLIMPORT bool ArmFeatures[]; + +extern void set_arm_features(void); + +static inline bool +arm_feature_available(ArmFeatureId feature) +{ + if (ArmFeatures[INIT_PG_ARM] == false) + set_arm_features(); + + return ArmFeatures[feature]; +} + #endif /* defined(USE_SSE2) || defined(__i386__) */ #endif /* PG_CPU_H */ diff --git a/src/port/pg_cpu_armv8.c b/src/port/pg_cpu_armv8.c index 6c22704b5fa..59a9229b71d 100644 --- a/src/port/pg_cpu_armv8.c +++ b/src/port/pg_cpu_armv8.c @@ -1,12 +1,7 @@ /*------------------------------------------------------------------------- * * pg_cpu_armv8.c - * Choose between ARMv8 and software CRC-32C implementation. - * - * On first call, checks if the CPU we're running on supports the ARMv8 - * CRC Extension. If it does, use the special instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation - * (slicing-by-8). 
+ * Runtime CPU feature detection for Arm-v8 * * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -41,27 +36,41 @@ #endif #endif -#include "port/pg_crc32c.h" +#include "port/pg_cpu.h" + + +bool ArmFeatures[ArmFeaturesSize] = {0}; -static bool -pg_crc32c_armv8_available(void) +static inline unsigned long +pg_getauxval(unsigned long at_hwcap) { #if defined(HAVE_ELF_AUX_INFO) unsigned long value; -#ifdef __aarch64__ - return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && - (value & HWCAP_CRC32) != 0; -#else - return elf_aux_info(AT_HWCAP2, &value, sizeof(value)) == 0 && - (value & HWCAP2_CRC32) != 0; -#endif + if (elf_aux_info(at_hwcap, &value, sizeof(value)) == 0) + return value; + else + return 0; #elif defined(HAVE_GETAUXVAL) + return getauxval(at_hwcap); +#endif +} + +void +set_arm_features(void) +{ +#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO #ifdef __aarch64__ - return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; + unsigned long hwcap = pg_getauxval(AT_HWCAP); + + ArmFeatures[PG_ARM_CRC32] = (hwcap & HWCAP_CRC32) != 0; + ArmFeatures[PG_ARM_SVE] = (hwcap & HWCAP_SVE) != 0; #else - return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0; + unsigned long hwcap2 = pg_getauxval(AT_HWCAP2); + + ArmFeatures[PG_ARM_CRC32] = (hwcap2 & HWCAP2_CRC32) != 0; #endif + #elif defined(__NetBSD__) /* * On NetBSD we can read the Instruction Set Attribute Registers via @@ -92,9 +101,9 @@ pg_crc32c_armv8_available(void) len = sizeof(sysctlbuf); memset(sysctlbuf, 0, len); if (sysctlbyname(path, sysctlbuf, &len, NULL, 0) != 0) - return false; /* perhaps kernel is 64-bit and we aren't? */ + return; /* perhaps kernel is 64-bit and we aren't? */ if (len != expected_len) - return false; /* kernel API change? */ + return; /* kernel API change? */ /* Fetch the CRC32 field from ISAR0. 
*/ fld = (ISAR0 >> ISAR0_CRC32_BITPOS) & WIDTHMASK(ISAR0_CRC32_BITWIDTH); @@ -104,27 +113,10 @@ pg_crc32c_armv8_available(void) * (CRC32B/CRC32H/CRC32W/CRC32X/CRC32CB/CRC32CH/CRC32CW/CRC32CX). Assume * that any future nonzero value will be a superset of 1. */ - return (fld != 0); -#else - return false; -#endif -} + ArmFeatures[PG_ARM_CRC32] = (fld != 0); +#endif /* __NetBSD__ */ -/* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. - */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) -{ - if (pg_crc32c_armv8_available()) - pg_comp_crc32c = pg_comp_crc32c_armv8; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; - - return pg_comp_crc32c(crc, data, len); + ArmFeatures[INIT_PG_ARM] = true; } -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; - -#endif /* __arm__ || __arm || __aarch64__ */ +#endif /* __arm__ || __arm || __aarch64__ */ diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 9ca0f728d39..a02264a00dc 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -20,8 +20,11 @@ #include <arm_acle.h> #endif +#include "port/pg_cpu.h" #include "port/pg_crc32c.h" +static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len); + pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) { @@ -77,3 +80,20 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) return crc; } + +/* + * This gets called on the first call. It replaces the function pointer + * so that subsequent calls are routed directly to the chosen implementation. 
+ */ +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +{ + if (arm_feature_available(PG_ARM_CRC32)) + pg_comp_crc32c = pg_comp_crc32c_armv8; + else + pg_comp_crc32c = pg_comp_crc32c_sb8; + + return pg_comp_crc32c(crc, data, len); +} + +pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index f474ef45510..37adff67ce8 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -18,17 +18,10 @@ #ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK #include <arm_sve.h> - -#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL) -#include <sys/auxv.h> -/* Ancient glibc releases don't include the HWCAPxxx macros in sys/auxv.h */ -#if defined(__linux__) && !defined(HWCAP_SVE) -#include <asm/hwcap.h> -#endif -#endif #endif #include "port/pg_bitutils.h" +#include "port/pg_cpu.h" /* * The Neon versions are built regardless of whether we are building the SVE @@ -56,25 +49,10 @@ static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; -static inline bool -pg_popcount_sve_available(void) -{ -#ifdef HAVE_ELF_AUX_INFO - unsigned long value; - - return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && - (value & HWCAP_SVE) != 0; -#elif defined(HAVE_GETAUXVAL) - return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; -#else - return false; -#endif -} - static inline void choose_popcount_functions(void) { - if (pg_popcount_sve_available()) + if (arm_feature_available(PG_ARM_SVE)) { pg_popcount_optimized = pg_popcount_sve; pg_popcount_masked_optimized = pg_popcount_masked_sve; -- 2.53.0
