Re: [PATCH] SVE popcount support

Nathan Bossart Wed, 26 Mar 2025 14:44:46 -0700

I've attached a new set of patches in which I've tried to address John's
feedback.  I ran some new benchmarks with these patches.  "M3" is an Apple
M3 (my laptop), "G3" is an r7g.4xlarge, and "G4" is an r8g.4xlarge.  "no
SVE" means the patches are applied but the function pointer points to the
Neon implementation.  "SVE" and "patched" mean all the patches are applied
with no changes.


 8 byte words | M3 HEAD | M3 patched | G3 HEAD | G3 no SVE | G3 SVE  | G4 HEAD | G4 no SVE | G4 SVE
--------------+---------+------------+---------+-----------+---------+---------+-----------+---------
            1 |     3.6 |     3.0    |     3.1 |     2.9   |     3.1 |     2.5 |     2.2   |     1.8
            2 |     6.4 |     4.4    |     3.1 |     3.0   |     3.1 |     2.5 |     2.5   |     2.0
            3 |     7.3 |     6.9    |     3.5 |     3.5   |     3.1 |     3.3 |     3.2   |     2.0
            4 |     8.0 |     3.8    |     4.0 |     2.7   |     4.7 |     3.6 |     2.2   |     2.7
            5 |     9.4 |     5.5    |     4.6 |     2.8   |     4.6 |     3.9 |     2.5   |     2.7
            6 |     7.9 |     5.0    |     5.1 |     3.5   |     4.7 |     4.3 |     3.1   |     3.4
            7 |    10.2 |     7.4    |     5.9 |     4.0   |     4.7 |     4.7 |     3.6   |     3.4
            8 |    12.0 |     5.4    |     6.5 |     4.0   |     5.9 |     5.0 |     3.2   |     2.5
            9 |    11.7 |     6.5    |     7.2 |     4.3   |     5.9 |     5.4 |     3.6   |     2.5
           10 |    12.5 |     5.4    |     8.0 |     4.8   |     5.9 |     6.2 |     3.9   |     3.1
           11 |    14.0 |     8.6    |     8.5 |     5.5   |     5.9 |     6.1 |     5.0   |     3.1
           12 |    13.1 |     5.7    |     9.1 |     5.1   |     7.4 |     6.4 |     3.9   |     3.6
           13 |    12.1 |     6.8    |     9.8 |     5.4   |     7.3 |     6.8 |     4.3   |     3.6
           14 |    16.4 |     7.8    |    10.4 |     5.9   |     7.4 |     7.2 |     4.7   |     4.4
           15 |    17.4 |     8.0    |    11.1 |     6.6   |     7.4 |     7.5 |     5.7   |     4.4
           16 |    15.5 |     5.7    |    11.8 |     5.7   |     4.7 |     7.9 |     5.0   |     3.5
           32 |    26.0 |    16.2    |    22.7 |    10.3   |     6.2 |    16.8 |     8.4   |     5.2
           64 |    38.5 |    20.3    |    42.7 |    20.1   |     9.3 |    31.8 |    15.4   |     8.8
          128 |    75.1 |    35.7    |    86.1 |    35.0   |    15.4 |    80.2 |    28.6   |    16.3
          256 |   117.7 |    51.8    |   179.6 |    68.2   |    27.8 |   154.0 |    55.7   |    30.9
          512 |   198.5 |    93.1    |   329.3 |   134.4   |    52.4 |   246.5 |   110.2   |    59.4
         1024 |   355.0 |   159.2    |   673.6 |   265.8   |   101.7 |   487.0 |   219.0   |   114.7
         2048 |   669.5 |   288.8    |  1294.7 |   529.7   |   200.3 |   969.3 |   438.7   |   228.5
         4096 |  1308.0 |   552.8    |  2784.3 |  1063.0   |   397.4 |  1934.5 |   874.4   |   455.9

IMHO these are acceptable results, at least for the use-cases I see in the
tree.  We might be able to minimize the difference between the Neon and SVE
implementations on the low end with some additional code, but I'm really
not sure if it's worth the effort.

Barring feedback or objections, I'm planning to commit these on Friday.

-- 
nathan

From 1a8d7b9552efa3bbbbde23be4b18b8031520150a Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Mon, 24 Mar 2025 19:48:41 -0500
Subject: [PATCH v9 1/3] Rename TRY_POPCNT_FAST to TRY_POPCNT_X86_64.

This macro guards x86_64-specific code, and a follow-up commit will
add AArch64-specific versions of that code.  To avoid confusion,
let's rename TRY_POPCNT_FAST to make it more obvious that it's for
x86_64.

Discussion: 
https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
---
 src/include/port/pg_bitutils.h |  6 +++---
 src/port/pg_bitutils.c         | 14 +++++++-------
 src/port/pg_popcount_avx512.c  |  8 ++++----
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 62554ce685a..3067ff402ba 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -294,11 +294,11 @@ pg_ceil_log2_64(uint64 num)
  */
 #ifdef HAVE_X86_64_POPCNTQ
 #if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_FAST 1
+#define TRY_POPCNT_X86_64 1
 #endif
 #endif
 
-#ifdef TRY_POPCNT_FAST
+#ifdef TRY_POPCNT_X86_64
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
@@ -322,7 +322,7 @@ extern int  pg_popcount64(uint64 word);
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 
mask);
 
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* TRY_POPCNT_X86_64 */
 
 /*
  * Returns the number of 1-bits in buf.
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 5677525693d..82be40e2fb4 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -108,7 +108,7 @@ static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
 static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
-#ifdef TRY_POPCNT_FAST
+#ifdef TRY_POPCNT_X86_64
 static bool pg_popcount_available(void);
 static int     pg_popcount32_choose(uint32 word);
 static int     pg_popcount64_choose(uint64 word);
@@ -123,9 +123,9 @@ int                 (*pg_popcount32) (uint32 word) = 
pg_popcount32_choose;
 int                    (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64         (*pg_popcount_optimized) (const char *buf, int bytes) = 
pg_popcount_choose;
 uint64         (*pg_popcount_masked_optimized) (const char *buf, int bytes, 
bits8 mask) = pg_popcount_masked_choose;
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* TRY_POPCNT_X86_64 */
 
-#ifdef TRY_POPCNT_FAST
+#ifdef TRY_POPCNT_X86_64
 
 /*
  * Return true if CPUID indicates that the POPCNT instruction is available.
@@ -337,7 +337,7 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 
mask)
        return popcnt;
 }
 
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* TRY_POPCNT_X86_64 */
 
 
 /*
@@ -486,13 +486,13 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 
mask)
        return popcnt;
 }
 
-#ifndef TRY_POPCNT_FAST
+#ifndef TRY_POPCNT_X86_64
 
 /*
  * When the POPCNT instruction is not available, there's no point in using
  * function pointers to vary the implementation between the fast and slow
  * method.  We instead just make these actual external functions when
- * TRY_POPCNT_FAST is not defined.  The compiler should be able to inline
+ * TRY_POPCNT_X86_64 is not defined.  The compiler should be able to inline
  * the slow versions here.
  */
 int
@@ -527,4 +527,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
        return pg_popcount_masked_slow(buf, bytes, mask);
 }
 
-#endif                                                 /* !TRY_POPCNT_FAST */
+#endif                                                 /* !TRY_POPCNT_X86_64 */
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index dac895a0fc2..80c0aee3e73 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -27,11 +27,11 @@
 #include "port/pg_bitutils.h"
 
 /*
- * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * It's probably unlikely that TRY_POPCNT_X86_64 won't be set if we are able to
  * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
- * the function pointers that are only used when TRY_POPCNT_FAST is set.
+ * the function pointers that are only used when TRY_POPCNT_X86_64 is set.
  */
-#ifdef TRY_POPCNT_FAST
+#ifdef TRY_POPCNT_X86_64
 
 /*
  * Does CPUID say there's support for XSAVE instructions?
@@ -219,5 +219,5 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 
mask)
        return _mm512_reduce_add_epi64(accum);
 }
 
-#endif                                                 /* TRY_POPCNT_FAST */
+#endif                                                 /* TRY_POPCNT_X86_64 */
 #endif                                                 /* 
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
-- 
2.39.5 (Apple Git-154)

From 5953da8e6c4d167954cbedfca58bd7558feb8620 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Mon, 24 Mar 2025 20:10:23 -0500
Subject: [PATCH v9 2/3] Add Neon popcount support.

This commit introduces a Neon implementation of pg_popcount{32,64},
pg_popcount(), and pg_popcount_masked().  As in simd.h, we assume
that all available AArch64 hardware supports Neon, so we
conveniently don't need any new configure-time or runtime checks.
Some compilers emit Neon instructions for these functions already,
but our hand-rolled implementations for pg_popcount() and
pg_popcount_masked() performed better in our tests, presumably due
to the instruction-level parallelism.

Author: "[email protected]" 
<[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
---
 src/include/port/pg_bitutils.h |   9 ++
 src/port/Makefile              |   1 +
 src/port/meson.build           |   1 +
 src/port/pg_bitutils.c         |  22 +++-
 src/port/pg_popcount_aarch64.c | 208 +++++++++++++++++++++++++++++++++
 5 files changed, 235 insertions(+), 6 deletions(-)
 create mode 100644 src/port/pg_popcount_aarch64.c

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 3067ff402ba..a387f77c2c0 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -298,6 +298,15 @@ pg_ceil_log2_64(uint64 num)
 #endif
 #endif
 
+/*
+ * On AArch64, we can use Neon instructions if the compiler provides access to
+ * them (as indicated by __ARM_NEON).  As in simd.h, we assume that all
+ * available 64-bit hardware has Neon support.
+ */
+#if defined(__aarch64__) && defined(__ARM_NEON)
+#define POPCNT_AARCH64 1
+#endif
+
 #ifdef TRY_POPCNT_X86_64
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
diff --git a/src/port/Makefile b/src/port/Makefile
index 4c224319512..cb86b7141e6 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
        noblock.o \
        path.o \
        pg_bitutils.o \
+       pg_popcount_aarch64.o \
        pg_popcount_avx512.o \
        pg_strong_random.o \
        pgcheckdir.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 7fcfa728d43..cad0dd8f4f8 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_popcount_aarch64.c',
   'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 82be40e2fb4..61c7388f474 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,10 +103,15 @@ const uint8 pg_number_of_ones[256] = {
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
 static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+#endif
 
 #ifdef TRY_POPCNT_X86_64
 static bool pg_popcount_available(void);
@@ -339,6 +344,10 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 
mask)
 
 #endif                                                 /* TRY_POPCNT_X86_64 */
 
+/*
+ * If we are building the Neon versions, we don't need the "slow" fallbacks.
+ */
+#ifndef POPCNT_AARCH64
 
 /*
  * pg_popcount32_slow
@@ -486,14 +495,15 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 
mask)
        return popcnt;
 }
 
-#ifndef TRY_POPCNT_X86_64
+#endif                                                 /* ! POPCNT_AARCH64 */
+
+#if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
 
 /*
- * When the POPCNT instruction is not available, there's no point in using
+ * When special CPU instructions are not available, there's no point in using
  * function pointers to vary the implementation between the fast and slow
- * method.  We instead just make these actual external functions when
- * TRY_POPCNT_X86_64 is not defined.  The compiler should be able to inline
- * the slow versions here.
+ * method.  We instead just make these actual external functions.  The compiler
+ * should be able to inline the slow versions here.
  */
 int
 pg_popcount32(uint32 word)
@@ -527,4 +537,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
        return pg_popcount_masked_slow(buf, bytes, mask);
 }
 
-#endif                                                 /* !TRY_POPCNT_X86_64 */
+#endif                                                 /* ! TRY_POPCNT_X86_64 
&& ! POPCNT_AARCH64 */
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
new file mode 100644
index 00000000000..cdcfee464e4
--- /dev/null
+++ b/src/port/pg_popcount_aarch64.c
@@ -0,0 +1,208 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_aarch64.c
+ *       Holds the AArch64 pg_popcount() implementations.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *       src/port/pg_popcount_aarch64.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_bitutils.h"
+
+#ifdef POPCNT_AARCH64
+
+#include <arm_neon.h>
+
+/*
+ * pg_popcount32
+ *             Return number of 1 bits in word
+ */
+int
+pg_popcount32(uint32 word)
+{
+       return pg_popcount64((uint64) word);
+}
+
+/*
+ * pg_popcount64
+ *             Return number of 1 bits in word
+ */
+int
+pg_popcount64(uint64 word)
+{
+       /*
+        * For some compilers, __builtin_popcountl() emits Neon instructions
+        * already. The line below should compile to the same code on those
+        * systems.
+        */
+       return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
+}
+
+/*
+ * pg_popcount_optimized
+ *             Returns number of 1 bits in buf
+ */
+uint64
+pg_popcount_optimized(const char *buf, int bytes)
+{
+       uint8x16_t      vec;
+       uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
+       uint64x2_t      accum1 = vdupq_n_u64(0),
+                               accum2 = vdupq_n_u64(0),
+                               accum3 = vdupq_n_u64(0),
+                               accum4 = vdupq_n_u64(0);
+       uint64          popcnt = 0;
+
+       /*
+        * For better instruction-level parallelism, each loop iteration 
operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               vec = vld1q_u8((const uint8 *) buf);
+               accum1 = vpadalq_u32(accum1, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum2 = vpadalq_u32(accum2, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum3 = vpadalq_u32(accum3, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum4 = vpadalq_u32(accum4, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+       }
+
+       /*
+        * If enough data remains, do another iteration on a block of two
+        * registers.
+        */
+       bytes_per_iteration = 2 * sizeof(uint8x16_t);
+       if (bytes >= bytes_per_iteration)
+       {
+               vec = vld1q_u8((const uint8 *) buf);
+               accum1 = vpadalq_u32(accum1, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vld1q_u8((const uint8 *) buf);
+               accum2 = vpadalq_u32(accum2, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               bytes -= bytes_per_iteration;
+       }
+
+       /*
+        * Add the accumulators.
+        */
+       popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
+       popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
+
+       /*
+        * Process remaining 8-byte blocks.
+        */
+       for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
+       {
+               popcnt += pg_popcount64(*((uint64 *) buf));
+               buf += sizeof(uint64);
+       }
+
+       /*
+        * Process any remaining data byte-by-byte.
+        */
+       while (bytes--)
+               popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+       return popcnt;
+}
+
+/*
+ * pg_popcount_masked_optimized
+ *             Returns number of 1 bits in buf after applying the mask to each 
byte
+ */
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+       uint8x16_t      vec;
+       uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
+       uint64x2_t      accum1 = vdupq_n_u64(0),
+                               accum2 = vdupq_n_u64(0),
+                               accum3 = vdupq_n_u64(0),
+                               accum4 = vdupq_n_u64(0);
+       uint64          popcnt = 0,
+                               mask64 = ~UINT64CONST(0) / 0xFF * mask;
+       uint8x16_t      maskv = vdupq_n_u8(mask);
+
+       /*
+        * For better instruction-level parallelism, each loop iteration 
operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum1 = vpadalq_u32(accum1, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum2 = vpadalq_u32(accum2, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum3 = vpadalq_u32(accum3, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum4 = vpadalq_u32(accum4, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+       }
+
+       /*
+        * If enough data remains, do another iteration on a block of two
+        * registers.
+        */
+       bytes_per_iteration = 2 * sizeof(uint8x16_t);
+       if (bytes >= bytes_per_iteration)
+       {
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum1 = vpadalq_u32(accum1, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               vec = vandq_u8(vld1q_u8((const uint8 *) buf), maskv);
+               accum2 = vpadalq_u32(accum2, 
vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec))));
+               buf += sizeof(uint8x16_t);
+
+               bytes -= bytes_per_iteration;
+       }
+
+       /*
+        * Add the accumulators.
+        */
+       popcnt += vaddvq_u64(vaddq_u64(accum1, accum2));
+       popcnt += vaddvq_u64(vaddq_u64(accum3, accum4));
+
+       /*
+        * Process remaining 8-byte blocks.
+        */
+       for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
+       {
+               popcnt += pg_popcount64(*((uint64 *) buf) & mask64);
+               buf += sizeof(uint64);
+       }
+
+       /*
+        * Process any remaining data byte-by-byte.
+        */
+       while (bytes--)
+               popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+       return popcnt;
+}
+
+#endif                                                 /* POPCNT_AARCH64 */
-- 
2.39.5 (Apple Git-154)

From 1b2c3a8101fb7a3844de4594141492f72981af12 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Mon, 24 Mar 2025 20:30:22 -0500
Subject: [PATCH v9 3/3] Add SVE popcount support.

This commit introduces SVE implementations of pg_popcount() and
pg_popcount_masked().
Unlike Neon support, we need an additional configure-time check to
discover whether the compiler supports SVE intrinsics, and we need
a runtime check to find whether the current CPU supports SVE
instructions.  While this commit introduces a new function pointer
so that the implementation can be chosen at runtime, the
AArch64-specific implementations are fast enough to avoid any
measurable regressions as compared to previous versions of
PostgreSQL.  The SVE implementations are much faster for larger
inputs, including the uses for the visibility map.

Author: "[email protected]" 
<[email protected]>
Co-authored-by: "Malladi, Rama" <[email protected]>
Co-authored-by: "[email protected]" <[email protected]>
Reviewed-by: Kirill Reshke <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/010101936e4aaa70-b474ab9e-b9ce-474d-a3ba-a3dc223d295c-000000%40us-west-2.amazonses.com
Discussion: 
https://postgr.es/m/OSZPR01MB84990A9A02A3515C6E85A65B8B2A2%40OSZPR01MB8499.jpnprd01.prod.outlook.com
---
 config/c-compiler.m4           |  53 ++++++++
 configure                      |  73 ++++++++++
 configure.ac                   |   9 ++
 meson.build                    |  50 +++++++
 src/include/pg_config.h.in     |   3 +
 src/include/port/pg_bitutils.h |  17 +++
 src/port/pg_popcount_aarch64.c | 235 ++++++++++++++++++++++++++++++++-
 7 files changed, 434 insertions(+), 6 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3712e81e38c..8490354a1e0 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -708,3 +708,56 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_SVE_POPCNT_INTRINSICS
+# --------------------------
+# Check if the compiler supports the SVE popcount instructions using the
+# svptrue_b64, svdup_u64, svcntb, svld1, svadd_x, svcnt_x, svaddv,
+# svwhilelt_b8, and svand_x intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
+AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
+
+       char buf[128];
+
+       #if defined(__has_attribute) && __has_attribute (target)
+       __attribute__((target("arch=armv8-a+sve")))
+       #endif
+       static int popcount_test(void)
+       {
+               uint32_t        vec_len = svcntb();
+               int                     bytes = sizeof(buf);
+               svuint64_t      accum1 = svdup_u64(0),
+                                       accum2 = svdup_u64(0),
+                                       vec64;
+               svuint8_t       vec8;
+               svbool_t        pred = svptrue_b64();
+               uint64_t        popcnt = 0,
+                                       mask = 0x5555555555555555;
+               char       *p = buf;
+
+               vec64 = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+               accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64));
+               p += vec_len;
+
+               vec64 = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+               accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64));
+               p += vec_len;
+
+               popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+               pred = svwhilelt_b8(0, bytes);
+               vec8 = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+               return (int) (popcnt + svaddv(pred, svcnt_x(pred, vec8)));
+       }]],
+  [return popcount_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_sve_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_SVE_POPCNT_INTRINSICS
diff --git a/configure b/configure
index fac1e9a4e39..2e291f97c99 100755
--- a/configure
+++ b/configure
@@ -17378,6 +17378,79 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 
1" >>confdefs.h
   fi
 fi
 
+# Check for SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5
+$as_echo_n "checking for svcnt_x... " >&6; }
+if ${pgac_cv_sve_popcnt_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_sve.h>
+
+       char buf[128];
+
+       #if defined(__has_attribute) && __has_attribute (target)
+       __attribute__((target("arch=armv8-a+sve")))
+       #endif
+       static int popcount_test(void)
+       {
+               uint32_t        vec_len = svcntb();
+               int                     bytes = sizeof(buf);
+               svuint64_t      accum1 = svdup_u64(0),
+                                       accum2 = svdup_u64(0),
+                                       vec64;
+               svuint8_t       vec8;
+               svbool_t        pred = svptrue_b64();
+               uint64_t        popcnt = 0,
+                                       mask = 0x5555555555555555;
+               char       *p = buf;
+
+               vec64 = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+               accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64));
+               p += vec_len;
+
+               vec64 = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+               accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64));
+               p += vec_len;
+
+               popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+               pred = svwhilelt_b8(0, bytes);
+               vec8 = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+               return (int) (popcnt + svaddv(pred, svcnt_x(pred, vec8)));
+       }
+int
+main ()
+{
+return popcount_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_sve_popcnt_intrinsics=yes
+else
+  pgac_cv_sve_popcnt_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: 
$pgac_cv_sve_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then
+  pgac_sve_popcnt_intrinsics=yes
+fi
+
+  if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
+
+$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and 
_mm_crc32_u32" >&5
diff --git a/configure.ac b/configure.ac
index b6d02f5ecc7..64b52940658 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2057,6 +2057,15 @@ if test x"$host_cpu" = x"x86_64"; then
   fi
 fi
 
+# Check for SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+  PGAC_SVE_POPCNT_INTRINSICS()
+  if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
+    AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use SVE 
popcount instructions with a runtime check.])
+  fi
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 7cf518a2765..f8f1dce6bc9 100644
--- a/meson.build
+++ b/meson.build
@@ -2285,6 +2285,56 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of SVE popcount intrinsics.
+###############################################################
+
+if host_cpu == 'aarch64'
+
+  prog = '''
+#include <arm_sve.h>
+
+char buf[128];
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("arch=armv8-a+sve")))
+#endif
+int main(void)
+{
+       uint32_t        vec_len = svcntb();
+       int                     bytes = sizeof(buf);
+       svuint64_t      accum1 = svdup_u64(0),
+                               accum2 = svdup_u64(0),
+                               vec64;
+       svuint8_t       vec8;
+       svbool_t        pred = svptrue_b64();
+       uint64_t        popcnt = 0,
+                               mask = 0x5555555555555555;
+       char       *p = buf;
+
+       vec64 = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+       accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64));
+       p += vec_len;
+
+       vec64 = svand_x(pred, svld1(pred, (const uint64_t *) p), mask);
+       accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64));
+       p += vec_len;
+
+       popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+
+       pred = svwhilelt_b8(0, bytes);
+       vec8 = svand_x(pred, svld1(pred, (const uint8_t *) p), 0x55);
+       return (int) (popcnt + svaddv(pred, svcnt_x(pred, vec8)));
+}
+'''
+
+  if cc.links(prog, name: 'SVE popcount', args: test_c_args)
+    cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
+
 ###############################################################
 # Select CRC-32C implementation.
 #
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index db6454090d2..2a67db077a9 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -706,6 +706,9 @@
 /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
 #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use SVE popcount instructions with a runtime check. */
+#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to build with systemd support. (--with-systemd) */
 #undef USE_SYSTEMD
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index a387f77c2c0..c7901bf8ddc 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int 
bytes);
 extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 
mask);
 #endif
 
+#elif POPCNT_AARCH64
+/* Use the Neon version of pg_popcount{32,64} without function pointer. */
+extern int     pg_popcount32(uint32 word);
+extern int     pg_popcount64(uint64 word);
+
+/*
+ * We can try to use an SVE-optimized pg_popcount() on some systems.  For that,
+ * we do use a function pointer.
+ */
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int 
bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, 
int bytes, bits8 mask);
+#else
+extern uint64 pg_popcount_optimized(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 
mask);
+#endif
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int     pg_popcount32(uint32 word);
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index cdcfee464e4..2b7a2f97b83 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -18,6 +18,229 @@
 
 #include <arm_neon.h>
 
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+#include <arm_sve.h>
+
+#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+#endif
+#endif
+
+/*
+ * The Neon versions are built regardless of whether we are building the SVE
+ * versions.
+ */
+static uint64 pg_popcount_neon(const char *buf, int bytes);
+static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+/*
+ * These are the SVE implementations of the popcount functions.
+ */
+static uint64 pg_popcount_sve(const char *buf, int bytes);
+static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
+
+/*
+ * The function pointers are initially set to "choose" functions.  These
+ * functions will first set the pointers to the right implementations (based on
+ * what the current CPU supports) and then will call the pointer to fulfill the
+ * caller's request.
+ */
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 
mask);
+uint64         (*pg_popcount_optimized) (const char *buf, int bytes) = 
pg_popcount_choose;
+uint64         (*pg_popcount_masked_optimized) (const char *buf, int bytes, 
bits8 mask) = pg_popcount_masked_choose;
+
+static inline bool
+pg_popcount_sve_available(void)
+{
+#ifdef HAVE_ELF_AUX_INFO
+       unsigned long value;
+
+       return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+               (value & HWCAP_SVE) != 0;
+#elif defined(HAVE_GETAUXVAL)
+       return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+#else
+       return false;
+#endif
+}
+
+static inline void
+choose_popcount_functions(void)
+{
+       if (pg_popcount_sve_available())
+       {
+               pg_popcount_optimized = pg_popcount_sve;
+               pg_popcount_masked_optimized = pg_popcount_masked_sve;
+       }
+       else
+       {
+               pg_popcount_optimized = pg_popcount_neon;
+               pg_popcount_masked_optimized = pg_popcount_masked_neon;
+       }
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+       choose_popcount_functions();
+       return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+       choose_popcount_functions();
+       return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
+/*
+ * pg_popcount_sve
+ *             Returns number of 1 bits in buf
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_sve(const char *buf, int bytes)
+{
+       uint32          vec_len = svcntb(),
+                               bytes_per_iteration = 4 * vec_len;
+       svuint64_t      accum1 = svdup_u64(0),
+                               accum2 = svdup_u64(0),
+                               accum3 = svdup_u64(0),
+                               accum4 = svdup_u64(0);
+       svbool_t        pred = svptrue_b64();
+       uint64          popcnt = 0;
+
+       /*
+        * For better instruction-level parallelism, each loop iteration operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               svuint64_t      vec;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svld1(pred, (const uint64 *) buf);
+               accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+       popcnt += svaddv(pred, svadd_x(pred, accum3, accum4));
+
+       /*
+        * Process the tail; the predicate disables lanes past the end of buf.
+        */
+       for (; bytes > 0; bytes -= vec_len)
+       {
+               svuint8_t       vec;
+
+               pred = svwhilelt_b8(0, bytes);
+               vec = svld1(pred, (const uint8 *) buf);
+               popcnt += svaddv(pred, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       return popcnt;
+}
+
+/*
+ * pg_popcount_masked_sve
+ *             Returns number of 1 bits in buf after applying the mask to each byte
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
+{
+       uint32          vec_len = svcntb(),
+                               bytes_per_iteration = 4 * vec_len;
+       svuint64_t      accum1 = svdup_u64(0),
+                               accum2 = svdup_u64(0),
+                               accum3 = svdup_u64(0),
+                               accum4 = svdup_u64(0);
+       svbool_t        pred = svptrue_b64();
+       uint64          popcnt = 0,
+                               mask64 = ~UINT64CONST(0) / 0xFF * mask;
+
+       /*
+        * For better instruction-level parallelism, each loop iteration operates
+        * on a block of four registers.
+        */
+       for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+       {
+               svuint64_t      vec;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum3 = svadd_x(pred, accum3, svcnt_x(pred, vec));
+               buf += vec_len;
+
+               vec = svand_x(pred, svld1(pred, (const uint64 *) buf), mask64);
+               accum4 = svadd_x(pred, accum4, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       popcnt += svaddv(pred, svadd_x(pred, accum1, accum2));
+       popcnt += svaddv(pred, svadd_x(pred, accum3, accum4));
+
+       /*
+        * Process the tail; the predicate disables lanes past the end of buf.
+        */
+       for (; bytes > 0; bytes -= vec_len)
+       {
+               svuint8_t       vec;
+
+               pred = svwhilelt_b8(0, bytes);
+               vec = svand_x(pred, svld1(pred, (const uint8 *) buf), mask);
+               popcnt += svaddv(pred, svcnt_x(pred, vec));
+               buf += vec_len;
+       }
+
+       return popcnt;
+}
+
+#else                                                  /* 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * When the SVE version isn't available, there's no point in using function
+ * pointers to vary the implementation.  We instead just make these actual
+ * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
+ * The compiler should be able to inline the slow versions here.
+ */
+uint64
+pg_popcount_optimized(const char *buf, int bytes)
+{
+       return pg_popcount_neon(buf, bytes);
+}
+
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+       return pg_popcount_masked_neon(buf, bytes, mask);
+}
+
+#endif                                                 /* ! 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
 /*
  * pg_popcount32
  *             Return number of 1 bits in word
@@ -44,11 +267,11 @@ pg_popcount64(uint64 word)
 }
 
 /*
- * pg_popcount_optimized
+ * pg_popcount_neon
  *             Returns number of 1 bits in buf
  */
-uint64
-pg_popcount_optimized(const char *buf, int bytes)
+static uint64
+pg_popcount_neon(const char *buf, int bytes)
 {
        uint8x16_t      vec;
        uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
@@ -124,11 +347,11 @@ pg_popcount_optimized(const char *buf, int bytes)
 }
 
 /*
- * pg_popcount_masked_optimized
+ * pg_popcount_masked_neon
  *             Returns number of 1 bits in buf after applying the mask to each 
byte
  */
-uint64
-pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+static uint64
+pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
 {
        uint8x16_t      vec;
        uint32          bytes_per_iteration = 4 * sizeof(uint8x16_t);
-- 
2.39.5 (Apple Git-154)

Re: [PATCH] SVE popcount support

Reply via email to