On 12/7/2024 12:42 AM, Devulapalli, Raghuveer wrote:
[0] https://cirrus-ci.com/task/6023394207989760
[1] https://cirrus-ci.com/task/5460444254568448
[2] https://cirrus-ci.com/task/6586344161411072
I was able to fix [0] and [1], but I can't think of why [2] fails. When I tried
to reproduce this locally, I get a different unrelated error. Any idea why I am
seeing this?
LINK : fatal error LNK1181: cannot open input file 'C:\Program Files\Git\nologo'
Commands: meson setup build && cd build && meson compile
Hello! I'm Matthew Sterrett and I'm a coworker of Raghuveer; he asked me
to look into the Windows build failures related to pg_comp_crc32c.
It seems that the only thing that was required to fix that is to mark
pg_comp_crc32c as PGDLLIMPORT, so I added a patch that does just that.
I'm new to working with mailing lists, so please tell me if I messed
anything up!
Matthew Sterrett
From 74d085d44d41af8ffb01f7bf2377ac487c7d4cc1 Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amon...@intel.com>
Date: Mon, 6 May 2024 08:34:17 -0700
Subject: [PATCH v10 1/4] Add a Postgres SQL function for crc32c benchmarking.
Add a drive_crc32c() function to use for benchmarking crc32c
computation. The function takes 2 arguments:
(1) count: num of times CRC32C is computed in a loop.
(2) num: #bytes in the buffer to calculate crc over.
Signed-off-by: Paul Amonson <paul.d.amon...@intel.com>
Signed-off-by: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com>
---
src/test/modules/meson.build | 1 +
src/test/modules/test_crc32c/Makefile | 20 ++++++++
src/test/modules/test_crc32c/meson.build | 22 +++++++++
.../modules/test_crc32c/test_crc32c--1.0.sql | 1 +
src/test/modules/test_crc32c/test_crc32c.c | 47 +++++++++++++++++++
.../modules/test_crc32c/test_crc32c.control | 4 ++
6 files changed, 95 insertions(+)
create mode 100644 src/test/modules/test_crc32c/Makefile
create mode 100644 src/test/modules/test_crc32c/meson.build
create mode 100644 src/test/modules/test_crc32c/test_crc32c--1.0.sql
create mode 100644 src/test/modules/test_crc32c/test_crc32c.c
create mode 100644 src/test/modules/test_crc32c/test_crc32c.control
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index c829b61953..68d8904dd0 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -15,6 +15,7 @@ subdir('ssl_passphrase_callback')
subdir('test_bloomfilter')
subdir('test_copy_callbacks')
subdir('test_custom_rmgrs')
+subdir('test_crc32c')
subdir('test_ddl_deparse')
subdir('test_dsa')
subdir('test_dsm_registry')
diff --git a/src/test/modules/test_crc32c/Makefile
b/src/test/modules/test_crc32c/Makefile
new file mode 100644
index 0000000000..5b747c6184
--- /dev/null
+++ b/src/test/modules/test_crc32c/Makefile
@@ -0,0 +1,20 @@
+MODULE_big = test_crc32c
+OBJS = test_crc32c.o
+PGFILEDESC = "test"
+EXTENSION = test_crc32c
+DATA = test_crc32c--1.0.sql
+
+first: all
+
+# test_crc32c.o: CFLAGS+=-g
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_crc32c
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_crc32c/meson.build
b/src/test/modules/test_crc32c/meson.build
new file mode 100644
index 0000000000..7021a6d6cf
--- /dev/null
+++ b/src/test/modules/test_crc32c/meson.build
@@ -0,0 +1,22 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+test_crc32c_sources = files(
+ 'test_crc32c.c',
+)
+
+if host_system == 'windows'
+ test_crc32c_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'test_crc32c',
+ '--FILEDESC', 'test_crc32c - test code for crc32c library',])
+endif
+
+test_crc32c = shared_module('test_crc32c',
+ test_crc32c_sources,
+ kwargs: pg_test_mod_args,
+)
+test_install_libs += test_crc32c
+
+test_install_data += files(
+ 'test_crc32c.control',
+ 'test_crc32c--1.0.sql',
+)
diff --git a/src/test/modules/test_crc32c/test_crc32c--1.0.sql
b/src/test/modules/test_crc32c/test_crc32c--1.0.sql
new file mode 100644
index 0000000000..32f8f0fb2e
--- /dev/null
+++ b/src/test/modules/test_crc32c/test_crc32c--1.0.sql
@@ -0,0 +1,6 @@
+-- drive_crc32c(count, num): compute the CRC-32C of a num-byte buffer count times.
+-- MODULE_PATHNAME is replaced with module_pathname from test_crc32c.control,
+-- so no platform-specific library suffix (.so/.dll/.dylib) is hard-coded here.
+CREATE FUNCTION drive_crc32c (count int, num int) RETURNS bigint
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
diff --git a/src/test/modules/test_crc32c/test_crc32c.c
b/src/test/modules/test_crc32c/test_crc32c.c
new file mode 100644
index 0000000000..b350caf5ce
--- /dev/null
+++ b/src/test/modules/test_crc32c/test_crc32c.c
@@ -0,0 +1,47 @@
+/* select drive_crc32c(1000000, 1024); */
+
+#include "postgres.h"
+#include "fmgr.h"
+#include "port/pg_crc32c.h"
+#include "common/pg_prng.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * drive_crc32c(count: int, num: int) returns bigint
+ *
+ * Benchmark driver: fills a num-byte buffer with pseudo-random data from a
+ * fixed seed and computes its CRC-32C count times.  Returns the last CRC,
+ * or the initial CRC value when count <= 0.  Both arguments are declared
+ * as SQL int, so they are fetched with PG_GETARG_INT32.
+ */
+PG_FUNCTION_INFO_V1(drive_crc32c);
+Datum
+drive_crc32c(PG_FUNCTION_ARGS)
+{
+	int32		count = PG_GETARG_INT32(0);
+	int32		num = PG_GETARG_INT32(1);
+	char	   *data;
+	pg_crc32c	crc;
+	pg_prng_state state;
+
+	/* palloc raises an error on a bad size, so data is never NULL */
+	data = palloc((Size) num);
+	pg_prng_seed(&state, 42);	/* fixed seed: same data on every call */
+	for (int32 i = 0; i < num; i++)
+		data[i] = pg_prng_uint32(&state) % 255;
+
+	/* initialize here so the result is defined even when count <= 0 */
+	INIT_CRC32C(crc);
+
+	for (int32 n = 0; n < count; n++)
+	{
+		INIT_CRC32C(crc);
+		COMP_CRC32C(crc, data, num);
+		FIN_CRC32C(crc);
+	}
+
+	pfree(data);
+
+	PG_RETURN_INT64((int64) crc);
+}
diff --git a/src/test/modules/test_crc32c/test_crc32c.control
b/src/test/modules/test_crc32c/test_crc32c.control
new file mode 100644
index 0000000000..878a077ee1
--- /dev/null
+++ b/src/test/modules/test_crc32c/test_crc32c.control
@@ -0,0 +1,4 @@
+comment = 'test'
+default_version = '1.0'
+module_pathname = '$libdir/test_crc32c'
+relocatable = true
--
2.34.1
From 2542c6830d98e146d79844fb84fe3fb1b2945c25 Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amon...@intel.com>
Date: Tue, 23 Jul 2024 11:23:23 -0700
Subject: [PATCH v10 2/4] Refactor: consolidate x86 ISA and OS runtime checks
Move all x86 ISA and OS runtime checks into a single file for improved
modularity and easier future maintenance.
Signed-off-by: Paul Amonson <paul.d.amon...@intel.com>
Signed-off-by: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com>
---
src/include/port/pg_bitutils.h | 1 -
src/include/port/pg_hw_feat_check.h | 33 ++++++
src/port/Makefile | 1 +
src/port/meson.build | 3 +
src/port/pg_bitutils.c | 22 +---
src/port/pg_crc32c_sse42_choose.c | 21 +---
src/port/pg_hw_feat_check.c | 163 ++++++++++++++++++++++++++++
src/port/pg_popcount_avx512.c | 78 -------------
8 files changed, 205 insertions(+), 117 deletions(-)
create mode 100644 src/include/port/pg_hw_feat_check.h
create mode 100644 src/port/pg_hw_feat_check.c
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index a3cad46afe..461c7c13cf 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -312,7 +312,6 @@ extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized)
(const char *buf, int
* files.
*/
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
-extern bool pg_popcount_avx512_available(void);
extern uint64 pg_popcount_avx512(const char *buf, int bytes);
extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8
mask);
#endif
diff --git a/src/include/port/pg_hw_feat_check.h
b/src/include/port/pg_hw_feat_check.h
new file mode 100644
index 0000000000..58be900b54
--- /dev/null
+++ b/src/include/port/pg_hw_feat_check.h
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_hw_feat_check.h
+ * Miscellaneous functions for checking for hardware features at runtime.
+ *
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * src/include/port/pg_hw_feat_check.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_HW_FEAT_CHECK_H
+#define PG_HW_FEAT_CHECK_H
+
+/*
+ * Test to see if all hardware features required by SSE 4.2 crc32c (64 bit)
+ * are available.
+ */
+extern PGDLLIMPORT bool pg_crc32c_sse42_available(void);
+
+/*
+ * Test to see if all hardware features required by the POPCNT instruction
+ * (64 bit) are available.
+ */
+extern PGDLLIMPORT bool pg_popcount_available(void);
+
+/*
+ * Test to see if all hardware features required by AVX-512 POPCNT are
+ * available.
+ */
+extern PGDLLIMPORT bool pg_popcount_avx512_available(void);
+#endif /* PG_HW_FEAT_CHECK_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index 4c22431951..6088b56b71 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -45,6 +45,7 @@ OBJS = \
path.o \
pg_bitutils.o \
pg_popcount_avx512.o \
+ pg_hw_feat_check.o \
pg_strong_random.o \
pgcheckdir.o \
pgmkdirp.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index c5bceed9cd..ec28590473 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -8,6 +8,9 @@ pgport_sources = [
'path.c',
'pg_bitutils.c',
'pg_popcount_avx512.c',
+ 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_sse42.c',
+ 'pg_hw_feat_check.c',
'pg_strong_random.c',
'pgcheckdir.c',
'pgmkdirp.c',
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index c8399981ee..c11b13dca2 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -20,7 +20,7 @@
#endif
#include "port/pg_bitutils.h"
-
+#include "port/pg_hw_feat_check.h"
/*
* Array giving the position of the left-most set bit for each possible
@@ -109,7 +109,6 @@ static uint64 pg_popcount_slow(const char *buf, int bytes);
static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
#ifdef TRY_POPCNT_FAST
-static bool pg_popcount_available(void);
static int pg_popcount32_choose(uint32 word);
static int pg_popcount64_choose(uint64 word);
static uint64 pg_popcount_choose(const char *buf, int bytes);
@@ -127,25 +126,6 @@ uint64 (*pg_popcount_masked_optimized) (const
char *buf, int bytes, bits8 mask)
#ifdef TRY_POPCNT_FAST
-/*
- * Return true if CPUID indicates that the POPCNT instruction is available.
- */
-static bool
-pg_popcount_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-
- return (exx[2] & (1 << 23)) != 0; /* POPCNT */
-}
-
/*
* These functions get called on the first call to pg_popcount32 etc.
* They detect whether we can use the asm implementations, and replace
diff --git a/src/port/pg_crc32c_sse42_choose.c
b/src/port/pg_crc32c_sse42_choose.c
index 56d600f3a9..c659917af0 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -20,6 +20,7 @@
#include "c.h"
+#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
#ifdef HAVE__GET_CPUID
#include <cpuid.h>
#endif
@@ -29,22 +30,7 @@
#endif
#include "port/pg_crc32c.h"
-
-static bool
-pg_crc32c_sse42_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-
- return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */
-}
+#include "port/pg_hw_feat_check.h"
/*
* This gets called on the first call. It replaces the function pointer
@@ -61,4 +47,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t
len)
return pg_comp_crc32c(crc, data, len);
}
-pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len)
= pg_comp_crc32c_choose;
+pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) =
pg_comp_crc32c_choose;
+#endif
diff --git a/src/port/pg_hw_feat_check.c b/src/port/pg_hw_feat_check.c
new file mode 100644
index 0000000000..260aa60502
--- /dev/null
+++ b/src/port/pg_hw_feat_check.c
@@ -0,0 +1,163 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_hw_feat_check.c
+ * Test for hardware features at runtime on x86_64 platforms.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/port/pg_hw_feat_check.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#include <immintrin.h>
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+#include "port/pg_hw_feat_check.h"
+
+/* Define names for EXX registers to avoid hard to see bugs in code below. */
+typedef unsigned int exx_t;
+typedef enum
+{
+	EAX = 0,
+	EBX = 1,
+	ECX = 2,
+	EDX = 3
+} reg_name;
+
+/*
+ * Return true if bit number 'bit' (0-31) is set in register 'ex' of the
+ * CPUID result array 'regs'.  The mask is unsigned so bit 31 is well-defined.
+ */
+inline static bool is_bit_set_in_exx(const exx_t *regs, reg_name ex, int bit)
+{
+	return (regs[ex] & ((exx_t) 1 << bit)) != 0;
+}
+
+/*
+ * Run CPUID for 'leaf' and store EAX..EDX in exx[0..3] (GCC-style or MSVC).
+ */
+inline static void
+pg_getcpuid(unsigned int leaf, exx_t *exx)
+{
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(leaf, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid((int *) exx, (int) leaf);	/* MSVC intrinsic takes int[4], int */
+#else
+#error cpuid instruction not available
+#endif
+}
+
+/*
+ * Run CPUID for 'leaf'/'subleaf' and store EAX..EDX in exx[0..3].
+ */
+inline static void
+pg_getcpuidex(unsigned int leaf, unsigned int subleaf, exx_t *exx)
+{
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(leaf, subleaf, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	__cpuidex((int *) exx, (int) leaf, (int) subleaf);	/* MSVC: int[4], int, int */
+#else
+#error cpuid instruction not available
+#endif
+}
+
+/*
+ * Does CPUID leaf 1 report OSXSAVE (ECX bit 27)?  False without XSAVE intrinsics.
+ */
+static inline bool
+osxsave_available(void)
+{
+#if defined(HAVE_XSAVE_INTRINSICS)
+	exx_t		regs[4] = {0};
+
+	pg_getcpuid(1, regs);
+
+	return is_bit_set_in_exx(regs, ECX, 27);	/* osxsave */
+#else
+	return false;
+#endif
+}
+
+/*
+ * Does XGETBV say the ZMM registers are enabled?  (Checks that XCR0 bits
+ * 1, 2, 5, 6 and 7 -- mask 0xe6 -- are all set.)
+ *
+ * NB: Caller must verify that osxsave_available() returns true first.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
+inline static bool
+zmm_regs_available(void)
+{
+#if defined(HAVE_XSAVE_INTRINSICS)
+ return (_xgetbv(0) & 0xe6) == 0xe6;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Does CPUID leaf 7, subleaf 0 report avx512-vpopcntdq (ECX bit 14) and
+ * avx512-bw (EBX bit 30)?
+ */
+inline static bool
+avx512_popcnt_available(void)
+{
+	exx_t		exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+
+	return is_bit_set_in_exx(exx, ECX, 14) && is_bit_set_in_exx(exx, EBX, 30);
+}
+
+/*
+ * Report whether the CPU supports POPCNT, i.e. CPUID leaf 1 sets ECX bit 23.
+ */
+bool PGDLLIMPORT pg_popcount_available(void)
+{
+	exx_t		regs[4] = {0};
+
+	pg_getcpuid(1, regs);
+
+	return is_bit_set_in_exx(regs, ECX, 23);	/* POPCNT */
+}
+
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_popcount() implementation.
+ *
+ * The osxsave_available() check MUST precede the zmm_regs_available() call,
+ * because zmm_regs_available() requires its caller to have verified it.
+ */
+bool PGDLLIMPORT pg_popcount_avx512_available(void)
+{
+	if (!osxsave_available() || !zmm_regs_available())
+		return false;
+	return avx512_popcnt_available();
+}
+
+/*
+ * Report whether the CPU supports SSE 4.2, i.e. CPUID leaf 1 sets ECX bit 20.
+ */
+bool PGDLLIMPORT pg_crc32c_sse42_available(void)
+{
+	exx_t		regs[4] = {0};
+
+	pg_getcpuid(1, regs);
+
+	return is_bit_set_in_exx(regs, ECX, 20);	/* SSE 4.2 */
+}
+
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index c8a4f2b19f..1123a1a634 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -14,16 +14,7 @@
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
-#include <cpuid.h>
-#endif
-
#include <immintrin.h>
-
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
-#include <intrin.h>
-#endif
-
#include "port/pg_bitutils.h"
/*
@@ -33,75 +24,6 @@
*/
#ifdef TRY_POPCNT_FAST
-/*
- * Does CPUID say there's support for XSAVE instructions?
- */
-static inline bool
-xsave_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
- return (exx[2] & (1 << 27)) != 0; /* osxsave */
-}
-
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that xsave_available() returns true
- * before calling this.
- */
-#ifdef HAVE_XSAVE_INTRINSICS
-pg_attribute_target("xsave")
-#endif
-static inline bool
-zmm_regs_available(void)
-{
-#ifdef HAVE_XSAVE_INTRINSICS
- return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
- return false;
-#endif
-}
-
-/*
- * Does CPUID say there's support for AVX-512 popcount and byte-and-word
- * instructions?
- */
-static inline bool
-avx512_popcnt_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID_COUNT)
- __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUIDEX)
- __cpuidex(exx, 7, 0);
-#else
-#error cpuid instruction not available
-#endif
- return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
- (exx[1] & (1 << 30)) != 0; /* avx512-bw */
-}
-
-/*
- * Returns true if the CPU supports the instructions required for the AVX-512
- * pg_popcount() implementation.
- */
-bool
-pg_popcount_avx512_available(void)
-{
- return xsave_available() &&
- zmm_regs_available() &&
- avx512_popcnt_available();
-}
-
/*
* pg_popcount_avx512
* Returns the number of 1-bits in buf
--
2.34.1
From f08e15c0834616c636d1cb949ed140926265847e Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapa...@intel.com>
Date: Thu, 21 Nov 2024 12:42:09 -0800
Subject: [PATCH v10 3/4] Add AVX-512 CRC32C algorithm with a runtime check
Adds pg_crc32c_avx512(): compute the crc32c of the buffer, where the
buffer length must be at least 256, and a multiple of 64. Based on:
"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction", V. Gopal, E. Ozturk, et al., 2009.
Benchmark numbers to compare against the SSE4.2 CRC32C algorithm were
generated by using the drive_crc32c() function added in
src/test/modules/test_crc32c/test_crc32c.c.
+------------------+-----------------+-----------------+-----------------+------------+------+
| Rate in bytes/us |    SDP (SPR)    |       m6i       |       m7i       |            |      |
+------------------+--------+--------+--------+--------+--------+--------+ Multiplier |  %   |
| higher is better | SSE42  | AVX512 | SSE42  | AVX512 | SSE42  | AVX512 |            |      |
+==================+========+========+========+========+========+========+============+======+
| AVG Rate 64-8192 | 10,095 | 82,101 |  8,591 | 38,652 | 11,867 | 83,194 |       6.68 | 568% |
+------------------+--------+--------+--------+--------+--------+--------+------------+------+
| AVG Rate 64-255  |  9,034 |  9,136 |  7,619 |  7,437 |  9,030 |  9,293 |       1.01 |   1% |
+------------------+--------+--------+--------+--------+--------+--------+------------+------+
Co-authored-by: Paul Amonson <paul.d.amon...@intel.com>
---
config/c-compiler.m4 | 32 +++++
configure | 154 ++++++++++++---------
configure.ac | 107 +++++++--------
meson.build | 23 ++++
src/include/pg_config.h.in | 3 +
src/include/pg_cpu.h | 23 ++++
src/include/port/pg_crc32c.h | 55 +++-----
src/include/port/pg_hw_feat_check.h | 6 +
src/port/meson.build | 10 +-
src/port/pg_crc32c_avx512.c | 203 ++++++++++++++++++++++++++++
src/port/pg_crc32c_sse42.c | 2 +
src/port/pg_crc32c_sse42_choose.c | 51 -------
src/port/pg_crc32c_x86_choose.c | 57 ++++++++
src/port/pg_hw_feat_check.c | 75 +++++++++-
14 files changed, 578 insertions(+), 223 deletions(-)
create mode 100644 src/include/pg_cpu.h
create mode 100644 src/port/pg_crc32c_avx512.c
delete mode 100644 src/port/pg_crc32c_sse42_choose.c
create mode 100644 src/port/pg_crc32c_x86_choose.c
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index e112fd45d4..e08de01739 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -578,6 +578,38 @@ undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
+# PGAC_AVX512_CRC32_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the x86 CRC instructions added in AVX-512,
+# using intrinsics with function __attribute__((target("..."))):
+
+AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm512_clmulepi64_epi128 with function attribute],
[Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+ #include <stdint.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("avx512vl,vpclmulqdq")))
+ #endif
+ static int crc32_avx512_test(void)
+ {
+ __m512i x0 = _mm512_set1_epi32(0x1);
+ __m512i x1 = _mm512_set1_epi32(0x2);
+ __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq
+ __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1),
_mm512_castsi512_si128(x0)); //avx512vl
+ int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit
instruction
+ return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+ }],
+ [return crc32_avx512_test();])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+ pgac_avx512_crc32_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_CRC32_INTRINSICS
+
+
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
# Check if the compiler supports the CRC32C instructions using the __crc32cb,
diff --git a/configure b/configure
index 518c33b73a..b03b928bfd 100755
--- a/configure
+++ b/configure
@@ -17159,7 +17159,7 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
1" >>confdefs.h
fi
fi
-# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
+# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations.
#
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and
_mm_crc32_u32" >&5
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; }
@@ -17203,6 +17203,52 @@ if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes";
then
fi
+# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with
+# the __attribute__((target("avx512vl,vpclmulqdq"))).
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128
with function attribute" >&5
+$as_echo_n "checking for _mm512_clmulepi64_epi128 with function attribute... "
>&6; }
+if ${pgac_cv_avx512_crc32_intrinsics+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <immintrin.h>
+ #include <stdint.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("avx512vl,vpclmulqdq")))
+ #endif
+ static int crc32_avx512_test(void)
+ {
+ __m512i x0 = _mm512_set1_epi32(0x1);
+ __m512i x1 = _mm512_set1_epi32(0x2);
+ __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq
+ __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1),
_mm512_castsi512_si128(x0)); //avx512vl
+ int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit
instruction
+ return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+ }
+int
+main ()
+{
+return crc32_avx512_test();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_avx512_crc32_intrinsics=yes
+else
+ pgac_cv_avx512_crc32_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result:
$pgac_cv_avx512_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_avx512_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_avx512_crc32_intrinsics" = x"yes"; then
+ pgac_avx512_crc32_intrinsics=yes
+fi
+
+
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -17404,9 +17450,8 @@ fi
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
# use the special CRC instructions for calculating CRC-32C. If we're not
# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, compile both implementations and select which one to use
-# at runtime, depending on whether SSE 4.2 is supported by the processor we're
-# running on.
+# the SSE/AVX-512 intrinsics, compile both implementations and select which one
+# to use at runtime, depending on runtime CPUID information.
#
# Similarly, if we are targeting an ARM processor that has the CRC
# instructions that are part of the ARMv8 CRC Extension, use them. And if
@@ -17423,95 +17468,80 @@ fi
#
# If we are targeting a LoongArch processor, CRC instructions are
# always available (at least on 64 bit), so no runtime check is needed.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x""
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" =
x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
- # Use Intel SSE 4.2 if available.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED"
= x"1" ; then
- USE_SSE42_CRC32C=1
- else
- # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
- # the runtime check.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test
x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
- USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
- else
- # Use ARM CRC Extension if available.
- if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC"
= x""; then
- USE_ARMV8_CRC32C=1
- else
- # ARM CRC Extension, with runtime check?
- if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
- USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
- else
- # LoongArch CRCC instructions.
- if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
- USE_LOONGARCH_CRC32C=1
- else
- # fall back to slicing-by-8 algorithm, which doesn't require any
- # special CPU support.
- USE_SLICING_BY_8_CRC32C=1
- fi
- fi
- fi
- fi
- fi
-fi
-# Set PG_CRC32C_OBJS appropriately depending on the selected implementation.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking which CRC-32C implementation
to use" >&5
$as_echo_n "checking which CRC-32C implementation to use... " >&6; }
-if test x"$USE_SSE42_CRC32C" = x"1"; then
+if test x"$host_cpu" = x"x86_64"; then
+ #x86 only:
+ PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_x86_choose.o"
+ if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test
x"$SSE4_2_TARGETED" = x"1" ; then
$as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_sse42.o"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
-$as_echo "SSE 4.2" >&6; }
-else
- if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+ PG_CRC32C_OBJS+=" pg_crc32c_sse42.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C baseline feature
SSE 4.2" >&5
+$as_echo "CRC32C baseline feature SSE 4.2" >&6; }
+ else
+ if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test
x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
$as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime
check" >&5
-$as_echo "SSE 4.2 with runtime check" >&6; }
- else
- if test x"$USE_ARMV8_CRC32C" = x"1"; then
+ PG_CRC32C_OBJS+=" pg_crc32c_sse42.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C SSE42 with
runtime check" >&5
+$as_echo "CRC32C SSE42 with runtime check" >&6; }
+ fi
+ fi
+ if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test
x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+
+$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ PG_CRC32C_OBJS+=" pg_crc32c_avx512.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C AVX-512 with
runtime check" >&5
+$as_echo "CRC32C AVX-512 with runtime check" >&6; }
+ fi
+else
+ # non x86 code:
+ # Use ARM CRC Extension if available.
+ if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" =
x""; then
$as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_armv8.o"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions"
>&5
+ PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions"
>&5
$as_echo "ARMv8 CRC instructions" >&6; }
- else
- if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+ else
+ # ARM CRC Extension, with runtime check?
+ if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
$as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o
pg_crc32c_armv8_choose.o"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC
instructions with runtime check" >&5
+ PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o
pg_crc32c_armv8_choose.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions
with runtime check" >&5
$as_echo "ARMv8 CRC instructions with runtime check" >&6; }
- else
- if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+ else
+ # LoongArch CRCC instructions.
+ if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC
instructions" >&5
+ PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC
instructions" >&5
$as_echo "LoongArch CRCC instructions" >&6; }
- else
+ else
+ # fall back to slicing-by-8 algorithm, which doesn't require any
+ # special CPU support.
$as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_sb8.o"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+ PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
$as_echo "slicing-by-8" >&6; }
- fi
fi
fi
fi
fi
-
# Select semaphore implementation type.
if test "$PORTNAME" != "win32"; then
if test x"$PREFERRED_SEMAPHORES" = x"NAMED_POSIX" ; then
diff --git a/configure.ac b/configure.ac
index 247ae97fa4..96a9c2db1f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2021,10 +2021,14 @@ if test x"$host_cpu" = x"x86_64"; then
fi
fi
-# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
+# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations.
#
PGAC_SSE42_CRC32_INTRINSICS()
+# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with
+# the __attribute__((target("avx512vl,vpclmulqdq"))).
+PGAC_AVX512_CRC32_INTRINSICS([])
+
# Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
# define __SSE4_2__ in that case.
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [
@@ -2060,9 +2064,8 @@ AC_SUBST(CFLAGS_CRC)
# If we are targeting a processor that has Intel SSE 4.2 instructions, we can
# use the special CRC instructions for calculating CRC-32C. If we're not
# targeting such a processor, but we can nevertheless produce code that uses
-# the SSE intrinsics, compile both implementations and select which one to use
-# at runtime, depending on whether SSE 4.2 is supported by the processor we're
-# running on.
+# the SSE/AVX-512 intrinsics, compile both implementations and select which one
+# to use at runtime, depending on runtime CPUID information.
#
# Similarly, if we are targeting an ARM processor that has the CRC
# instructions that are part of the ARMv8 CRC Extension, use them. And if
@@ -2079,76 +2082,58 @@ AC_SUBST(CFLAGS_CRC)
#
# If we are targeting a LoongArch processor, CRC instructions are
# always available (at least on 64 bit), so no runtime check is needed.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x""
&& test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test
x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" =
x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
- # Use Intel SSE 4.2 if available.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED"
= x"1" ; then
- USE_SSE42_CRC32C=1
- else
- # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
- # the runtime check.
- if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test
x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
- USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
+
+AC_MSG_CHECKING([which CRC-32C implementation to use])
+if test x"$host_cpu" = x"x86_64"; then
+ #x86 only:
+ PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_x86_choose.o"
+ if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test
x"$SSE4_2_TARGETED" = x"1" ; then
+ AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC
instructions.])
+ PG_CRC32C_OBJS+=" pg_crc32c_sse42.o"
+ AC_MSG_RESULT(CRC32C baseline feature SSE 4.2)
else
- # Use ARM CRC Extension if available.
- if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC"
= x""; then
- USE_ARMV8_CRC32C=1
- else
- # ARM CRC Extension, with runtime check?
- if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
- USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
- else
- # LoongArch CRCC instructions.
- if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
- USE_LOONGARCH_CRC32C=1
- else
- # fall back to slicing-by-8 algorithm, which doesn't require any
- # special CPU support.
- USE_SLICING_BY_8_CRC32C=1
- fi
+ if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test
x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+ AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to
use Intel SSE 4.2 CRC instructions with a runtime check.])
+ PG_CRC32C_OBJS+=" pg_crc32c_sse42.o"
+ AC_MSG_RESULT(CRC32C SSE42 with runtime check)
fi
- fi
fi
- fi
-fi
-
-# Set PG_CRC32C_OBJS appropriately depending on the selected implementation.
-AC_MSG_CHECKING([which CRC-32C implementation to use])
-if test x"$USE_SSE42_CRC32C" = x"1"; then
- AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC
instructions.])
- PG_CRC32C_OBJS="pg_crc32c_sse42.o"
- AC_MSG_RESULT(SSE 4.2)
+ if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test
x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+ AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use
Intel AVX 512 CRC instructions with a runtime check.])
+ PG_CRC32C_OBJS+=" pg_crc32c_avx512.o"
+ AC_MSG_RESULT(CRC32C AVX-512 with runtime check)
+ fi
else
- if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
- AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use
Intel SSE 4.2 CRC instructions with a runtime check.])
- PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
- AC_MSG_RESULT(SSE 4.2 with runtime check)
+ # non x86 code:
+ # Use ARM CRC Extension if available.
+ if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" =
x""; then
+ AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
+ PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+ AC_MSG_RESULT(ARMv8 CRC instructions)
else
- if test x"$USE_ARMV8_CRC32C" = x"1"; then
- AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
- PG_CRC32C_OBJS="pg_crc32c_armv8.o"
- AC_MSG_RESULT(ARMv8 CRC instructions)
+ # ARM CRC Extension, with runtime check?
+ if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
+ AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use
ARMv8 CRC Extension with a runtime check.])
+ PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o
pg_crc32c_armv8_choose.o"
+ AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
else
- if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
- AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use
ARMv8 CRC Extension with a runtime check.])
- PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o
pg_crc32c_armv8_choose.o"
- AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
+ # LoongArch CRCC instructions.
+ if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+ AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC
instructions.])
+ PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+ AC_MSG_RESULT(LoongArch CRCC instructions)
else
- if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
- AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch
CRCC instructions.])
- PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
- AC_MSG_RESULT(LoongArch CRCC instructions)
- else
- AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software
CRC-32C implementation (slicing-by-8).])
- PG_CRC32C_OBJS="pg_crc32c_sb8.o"
- AC_MSG_RESULT(slicing-by-8)
- fi
+ # fall back to slicing-by-8 algorithm, which doesn't require any
+ # special CPU support.
+ AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software
CRC-32C implementation (slicing-by-8).])
+ PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+ AC_MSG_RESULT(slicing-by-8)
fi
fi
fi
fi
AC_SUBST(PG_CRC32C_OBJS)
-
# Select semaphore implementation type.
if test "$PORTNAME" != "win32"; then
if test x"$PREFERRED_SEMAPHORES" = x"NAMED_POSIX" ; then
diff --git a/meson.build b/meson.build
index e5ce437a5c..5833661d71 100644
--- a/meson.build
+++ b/meson.build
@@ -2222,6 +2222,23 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
have_optimized_crc = true
else
+ avx512_crc_prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx512vl,vpclmulqdq")))
+#endif
+int main(void)
+{
+ __m512i x0 = _mm512_set1_epi32(0x1);
+ __m512i x1 = _mm512_set1_epi32(0x2);
+ __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq
+ __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1),
_mm512_castsi512_si128(x0)); //avx512vl
+ int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit
instruction
+ return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+}
+'''
+
prog = '''
#include <nmmintrin.h>
@@ -2252,6 +2269,12 @@ int main(void)
cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = true
endif
+ if cc.links(avx512_crc_prog,
+ name: 'AVX512 CRC32C with function attributes',
+ args: test_c_args)
+ cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
+ have_optimized_crc = true
+ endif
endif
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07b2f798ab..db40e6476d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -697,6 +697,9 @@
/* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
#undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
+/* Define to 1 to use Intel AVX-512 CRC instructions with a runtime check. */
+#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+
/* Define to build with systemd support. (--with-systemd) */
#undef USE_SYSTEMD
diff --git a/src/include/pg_cpu.h b/src/include/pg_cpu.h
new file mode 100644
index 0000000000..223994cb0d
--- /dev/null
+++ b/src/include/pg_cpu.h
@@ -0,0 +1,23 @@
+/*
+ * pg_cpu.h
+ * Useful macros to determine CPU types
+ */
+
+#ifndef PG_CPU_H_
+#define PG_CPU_H_
+#if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
+ /*
+ * __i386__ is defined by gcc and Intel compiler on Linux,
+ * _M_IX86 by VS compiler,
+ * i386 by Sun compilers on opensolaris at least
+ */
+ #define PG_CPU_X86
+#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) ||
defined(_M_AMD64)
+ /*
+ * both __x86_64__ and __amd64__ are defined by gcc
+ * __x86_64 defined by sun compiler on opensolaris at least
+ * _M_AMD64 defined by MS compiler
+ */
+ #define PG_CPU_x86_64
+#endif
+#endif // PG_CPU_H_
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 63c8e3a00b..690273506b 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -34,58 +34,43 @@
#define PG_CRC32C_H
#include "port/pg_bswap.h"
+#include "pg_cpu.h"
typedef uint32 pg_crc32c;
/* The INIT and EQ macros are the same for all implementations. */
#define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
#define EQ_CRC32C(c1, c2) ((c1) == (c2))
-
-#if defined(USE_SSE42_CRC32C)
-/* Use Intel SSE4.2 instructions. */
-#define COMP_CRC32C(crc, data, len) \
- ((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+/* x86 */
+#if defined(PG_CPU_X86) || defined(PG_CPU_x86_64)
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t
len);
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t
len);
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t
len);
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t
len);
+#define COMP_CRC32C(crc, data, len) \
+ ((crc) = pg_comp_crc32c((crc), (data), (len)))
+/* ARMV8 */
#elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
-
+extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t
len);
#define COMP_CRC32C(crc, data, len)
\
((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
-#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+/* ARMV8 with runtime check */
+#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t
len);
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t
len);
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t
len);
+#define COMP_CRC32C(crc, data, len) \
+ ((crc) = pg_comp_crc32c((crc), (data), (len)))
+/* LoongArch */
#elif defined(USE_LOONGARCH_CRC32C)
-/* Use LoongArch CRCC instructions. */
-
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data,
size_t len);
#define COMP_CRC32C(crc, data, len)
\
((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
-#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
-
-extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data,
size_t len);
-
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) ||
defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
-
-/*
- * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
- * to check that they are available.
- */
-#define COMP_CRC32C(crc, data, len) \
- ((crc) = pg_comp_crc32c((crc), (data), (len)))
-#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
-
-extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t
len);
-extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t
len);
-
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t
len);
-#endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t
len);
-#endif
#else
/*
@@ -98,13 +83,11 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const
void *data, size_t le
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c_sb8((crc), (data), (len)))
#ifdef WORDS_BIGENDIAN
+#undef FIN_CRC32C
#define FIN_CRC32C(crc) ((crc) = pg_bswap32(crc) ^ 0xFFFFFFFF)
-#else
-#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
#endif
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t
len);
-
#endif
#endif /* PG_CRC32C_H */
diff --git a/src/include/port/pg_hw_feat_check.h
b/src/include/port/pg_hw_feat_check.h
index 58be900b54..3a73014987 100644
--- a/src/include/port/pg_hw_feat_check.h
+++ b/src/include/port/pg_hw_feat_check.h
@@ -30,4 +30,10 @@ extern PGDLLIMPORT bool pg_popcount_available(void);
* available.
*/
extern PGDLLIMPORT bool pg_popcount_avx512_available(void);
+
+/*
+ * Test to see if all hardware features required by the AVX-512 SIMD
+ * algorithm are available.
+ */
+extern PGDLLIMPORT bool pg_crc32c_avx512_available(void);
#endif /* PG_HW_FEAT_CHECK_H */
diff --git a/src/port/meson.build b/src/port/meson.build
index ec28590473..0ba4a56194 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -8,8 +8,10 @@ pgport_sources = [
'path.c',
'pg_bitutils.c',
'pg_popcount_avx512.c',
- 'pg_crc32c_sse42_choose.c',
+ 'pg_crc32c_x86_choose.c',
+ 'pg_crc32c_avx512.c',
'pg_crc32c_sse42.c',
+ 'pg_crc32c_sb8.c',
'pg_hw_feat_check.c',
'pg_strong_random.c',
'pgcheckdir.c',
@@ -83,12 +85,6 @@ endif
# Replacement functionality to be built if corresponding configure symbol
# is true
replace_funcs_pos = [
- # x86/x64
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
- ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
-
# arm / aarch64
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
diff --git a/src/port/pg_crc32c_avx512.c b/src/port/pg_crc32c_avx512.c
new file mode 100644
index 0000000000..ba4defcefd
--- /dev/null
+++ b/src/port/pg_crc32c_avx512.c
@@ -0,0 +1,203 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_avx512.c
+ * Compute CRC-32C checksum using Intel AVX-512 instructions.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/port/pg_crc32c_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#if defined(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK)
+
+#include <immintrin.h>
+
+#include "port/pg_crc32c.h"
+
+
+/*******************************************************************
+ * pg_comp_crc32c_avx512(): compute the crc32c of the buffer. The AVX-512
+ * path is used for inputs of at least 256 bytes and consumes them in
+ * 64-byte blocks; the rest goes to pg_comp_crc32c_sse42(). Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
+ * Instruction"
+ * V. Gopal, E. Ozturk, et al., 2009
+ *
+ * For This Function:
+ * Copyright 2015 The Chromium Authors
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google LLC nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+pg_attribute_no_sanitize_alignment()
+pg_attribute_target("avx512vl,vpclmulqdq")
+inline pg_crc32c
+pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length)
+{
+	static const uint64 k1k2[8] = {
+		0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4,
+		0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+	static const uint64 k3k4[8] = {
+		0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02,
+		0x9e4addf8, 0x740eef02, 0x9e4addf8};
+	static const uint64 k9k10[8] = {
+		0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2,
+		0x0d3b6092, 0x6992cea2, 0x0d3b6092};
+	static const uint64 k1k4[8] = {
+		0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe,
+		0x493c7d27, 0x00000000, 0x00000000};
+
+	const uint8 *input = (const uint8 *)data;
+	if (length >= 256)
+	{
+		uint64		val;
+		__m512i		x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+		__m128i		a1, a2;
+
+		/*
+		 * AVX-512 path, taken for inputs of at least 256 bytes. The constant
+		 * tables are not declared 64-byte aligned, so they are read with
+		 * unaligned loads, as is the input buffer.
+		 */
+
+		/*
+		 * There's at least one block of 256.
+		 */
+		x1 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+		x2 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+		x3 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+		x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+		x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+
+		x0 = _mm512_loadu_si512((__m512i *)k1k2);
+
+		input += 256;
+		length -= 256;
+
+		/*
+		 * Parallel fold blocks of 256, if any.
+		 */
+		while (length >= 256)
+		{
+			x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+			x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+			x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+			x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
+
+			x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+			x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+			x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+			x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
+
+			y5 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+			y6 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+			y7 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+			y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+			x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+			x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96);
+			x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96);
+			x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96);
+
+			input += 256;
+			length -= 256;
+		}
+
+		/*
+		 * Fold 256 bytes into 64 bytes.
+		 */
+		x0 = _mm512_loadu_si512((__m512i *)k9k10);
+		x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+		x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+		x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96);
+
+		x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+		x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+		x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96);
+
+		x0 = _mm512_loadu_si512((__m512i *)k3k4);
+		y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+		y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+		x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96);
+
+		/*
+		 * Single fold blocks of 64, if any.
+		 */
+		while (length >= 64)
+		{
+			x2 = _mm512_loadu_si512((__m512i *)input);
+
+			x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+			x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+			x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96);
+
+			input += 64;
+			length -= 64;
+		}
+
+		/*
+		 * Fold 512-bits to 128-bits.
+		 */
+		x0 = _mm512_loadu_si512((__m512i *)k1k4);
+
+		a2 = _mm512_extracti32x4_epi32(x1, 3);
+		x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+		x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+		x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96);
+
+		x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+		x0 = _mm512_xor_epi64(x1, x0);
+		a1 = _mm512_extracti32x4_epi32(x0, 1);
+		a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+
+		/*
+		 * Fold 128-bits to 32-bits.
+		 */
+		val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+		crc = (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+
+		/*
+		 * End of the AVX-512 path; fewer than 64 bytes of input are left
+		 * at this point.
+		 */
+	}
+
+	/*
+	 * Finish any remaining bytes (or the whole input, if it was shorter
+	 * than 256 bytes) with the SSE 4.2 implementation.
+	 */
+	return pg_comp_crc32c_sse42(crc, input, length);
+}
+#endif /* USE_AVX512_CRC32C_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index dcc4904a82..90d155e804 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -14,6 +14,7 @@
*/
#include "c.h"
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
#include <nmmintrin.h>
#include "port/pg_crc32c.h"
@@ -68,3 +69,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t
len)
return crc;
}
+#endif
diff --git a/src/port/pg_crc32c_sse42_choose.c
b/src/port/pg_crc32c_sse42_choose.c
deleted file mode 100644
index c659917af0..0000000000
--- a/src/port/pg_crc32c_sse42_choose.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * pg_crc32c_sse42_choose.c
- * Choose between Intel SSE 4.2 and software CRC-32C implementation.
- *
- * On first call, checks if the CPU we're running on supports Intel SSE
- * 4.2. If it does, use the special SSE instructions for CRC-32C
- * computation. Otherwise, fall back to the pure software implementation
- * (slicing-by-8).
- *
- * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * src/port/pg_crc32c_sse42_choose.c
- *
- *-------------------------------------------------------------------------
- */
-
-#include "c.h"
-
-#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
-#ifdef HAVE__GET_CPUID
-#include <cpuid.h>
-#endif
-
-#ifdef HAVE__CPUID
-#include <intrin.h>
-#endif
-
-#include "port/pg_crc32c.h"
-#include "port/pg_hw_feat_check.h"
-
-/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
-{
- if (pg_crc32c_sse42_available())
- pg_comp_crc32c = pg_comp_crc32c_sse42;
- else
- pg_comp_crc32c = pg_comp_crc32c_sb8;
-
- return pg_comp_crc32c(crc, data, len);
-}
-
-pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) =
pg_comp_crc32c_choose;
-#endif
diff --git a/src/port/pg_crc32c_x86_choose.c b/src/port/pg_crc32c_x86_choose.c
new file mode 100644
index 0000000000..3ce8be11a6
--- /dev/null
+++ b/src/port/pg_crc32c_x86_choose.c
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_x86_choose.c
+ * Choose between Intel AVX-512, SSE 4.2 and software CRC-32C
implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel AVX-512
+ * and, failing that, SSE 4.2, and uses the matching CRC-32C implementation.
+ * Otherwise, fall back to the pure software implementation (slicing-by-8).
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/port/pg_crc32c_x86_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+#include "pg_cpu.h"
+
+#if defined(PG_CPU_X86) || defined(PG_CPU_x86_64)
+
+#include "port/pg_crc32c.h"
+#include "port/pg_hw_feat_check.h"
+
+/*
+ * This gets called on the first call.  It picks the best implementation
+ * available (AVX-512, then SSE 4.2, then slicing-by-8), stores it in
+ * pg_comp_crc32c so later calls go straight to it, and returns the CRC.
+ */
+static pg_crc32c
+pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
+{
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+	if (pg_crc32c_avx512_available()) {
+		pg_comp_crc32c = pg_comp_crc32c_avx512;
+		return pg_comp_crc32c(crc, data, len);
+	}
+#endif
+#ifdef USE_SSE42_CRC32C
+	pg_comp_crc32c = pg_comp_crc32c_sse42;
+	return pg_comp_crc32c(crc, data, len);
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+	if (pg_crc32c_sse42_available()) {
+		pg_comp_crc32c = pg_comp_crc32c_sse42;
+		return pg_comp_crc32c(crc, data, len);
+	}
+#endif
+	pg_comp_crc32c = pg_comp_crc32c_sb8;
+	return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) =
pg_comp_crc32c_choose;
+
+#endif // x86/x86_64
diff --git a/src/port/pg_hw_feat_check.c b/src/port/pg_hw_feat_check.c
index 260aa60502..b2872fa708 100644
--- a/src/port/pg_hw_feat_check.c
+++ b/src/port/pg_hw_feat_check.c
@@ -11,6 +11,9 @@
*-------------------------------------------------------------------------
*/
#include "c.h"
+#include "pg_cpu.h"
+
+#if defined(PG_CPU_X86) || defined(PG_CPU_x86_64)
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
@@ -135,9 +138,60 @@ bool PGDLLIMPORT pg_popcount_available(void)
return is_bit_set_in_exx(exx, ECX, 23);
}
+/*
+ * Check for CPU support for CPUIDEX: avx512-f
+ */
+inline static bool
+avx512f_available(void)
+{
+ exx_t exx[4] = {0, 0, 0, 0};
+
+ pg_getcpuidex(7, 0, exx);
+ return is_bit_set_in_exx(exx, EBX, 16); /* avx512-f */
+}
+
+/*
+ * Check for CPU support for CPUIDEX: vpclmulqdq
+ */
+inline static bool
+vpclmulqdq_available(void)
+{
+ exx_t exx[4] = {0, 0, 0, 0};
+
+ pg_getcpuidex(7, 0, exx);
+ return is_bit_set_in_exx(exx, ECX, 10); /* vpclmulqdq */
+}
+
+/*
+ * Check for CPU support for CPUIDEX: avx512-vl
+ */
+inline static bool
+avx512vl_available(void)
+{
+ exx_t exx[4] = {0, 0, 0, 0};
+
+ pg_getcpuidex(7, 0, exx);
+ return is_bit_set_in_exx(exx, EBX, 31); /* avx512-vl */
+}
+
+/*
+ * Check for CPU support for CPUID: sse4.2
+ */
+inline static bool
+sse42_available(void)
+{
+ exx_t exx[4] = {0, 0, 0, 0};
+
+ pg_getcpuid(1, exx);
+ return is_bit_set_in_exx(exx, ECX, 20); /* sse4.2 */
+}
+
+/****************************************************************************/
+/* Public API */
+/****************************************************************************/
/*
- * Returns true if the CPU supports the instructions required for the AVX-512
- * pg_popcount() implementation.
+ * Returns true if the CPU supports the instructions required for the
+ * AVX-512 pg_popcount() implementation.
*
* PA: The call to 'osxsave_available' MUST preceed the call to
* 'zmm_regs_available' function per NB above.
@@ -154,10 +208,19 @@ bool PGDLLIMPORT pg_popcount_avx512_available(void)
*/
bool PGDLLIMPORT pg_crc32c_sse42_available(void)
{
- exx_t exx[4] = {0, 0, 0, 0};
-
- pg_getcpuid(1, exx);
+ return sse42_available();
+}
- return is_bit_set_in_exx(exx, ECX, 20);
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_crc32c implementation.
+ */
+bool PGDLLIMPORT
+pg_crc32c_avx512_available(void)
+{
+ return sse42_available() && osxsave_available() &&
+ avx512f_available() && vpclmulqdq_available() &&
+ avx512vl_available() && zmm_regs_available();
}
+#endif // #if defined(PG_CPU_X86) || defined(PG_CPU_x86_64)
--
2.34.1
From 6e8f557c857772b0c22607866d1b8930a67df05e Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterr...@intel.com>
Date: Wed, 18 Dec 2024 14:11:33 -0800
Subject: [PATCH v10 4/4] Mark pg_comp_crc32c as PGDLLIMPORT for Windows build
---
src/include/port/pg_crc32c.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 690273506b..534d07dd5d 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -48,7 +48,7 @@ typedef uint32 pg_crc32c;
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t
len);
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t
len);
extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t
len);
-extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t
len);
+extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void
*data, size_t len);
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c((crc), (data), (len)))
--
2.34.1