Currently, all platforms must indirect through a function pointer to call
popcount on a word-sized input, even though we don't arrange for a fast
implementation on non-x86 to make it worthwhile.
0001 moves some declarations around so that "slow" popcount functions are
called directly on non-x86 platforms.
0002 was an idea to simplify and unify the coding for the slow functions.
Also attached is a test module for building microbenchmarks.
On a Power8 machine using gcc 4.8, and running
time ./inst/bin/psql -c 'select drive_popcount(100000, 1024)'
I get
master: 647ms
0001: 183ms
0002: 228ms
So 0001 is a clear winner on that platform. 0002 is still good, but slower
than 0001 for some reason, and it turns out that on master, gcc does emit a
popcnt instruction from the intrinsic:
0000000000000000 <pg_popcount32_slow>:
0: f4 02 63 7c popcntw r3,r3
4: b4 07 63 7c extsw r3,r3
8: 20 00 80 4e blr
...
The gcc docs mention a flag for this, but I'm not sure why it seems not to
need it:
https://gcc.gnu.org/onlinedocs/gcc/RS_002f6000-and-PowerPC-Options.html#RS_002f6000-and-PowerPC-Options
Maybe that's because the machine I used was ppc64le, but I'm not sure a ppc
binary built like this is portable to other hardware. For that reason,
maybe 0002 is a good idea.
--
John Naylor
EDB: http://www.enterprisedb.com
diff --git a/src/test/modules/test_popcount/Makefile b/src/test/modules/test_popcount/Makefile
new file mode 100644
index 0000000000..8a49d1ccfc
--- /dev/null
+++ b/src/test/modules/test_popcount/Makefile
@@ -0,0 +1,21 @@
+MODULE_big = test_popcount
+OBJS = test_popcount.o
+PGFILEDESC = "test"
+EXTENSION = test_popcount
+DATA = test_popcount--1.0.sql
+
+first: all
+
+# needed?
+test_popcount.o: test_popcount.c
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_popcount
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_popcount/test_popcount--1.0.sql b/src/test/modules/test_popcount/test_popcount--1.0.sql
new file mode 100644
index 0000000000..a5975e54cd
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount--1.0.sql
@@ -0,0 +1,2 @@
+CREATE FUNCTION drive_popcount (count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
+CREATE FUNCTION drive_popcount64(count int, num int) RETURNS bigint AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_popcount/test_popcount.c b/src/test/modules/test_popcount/test_popcount.c
new file mode 100644
index 0000000000..0030ed142f
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount.c
@@ -0,0 +1,71 @@
+/* select drive_popcount(1000000, 1024); */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+
+#include "port/pg_bitutils.h"
+
+PG_MODULE_MAGIC;
+
+#ifndef PG_POPCOUNT64
+#define PG_POPCOUNT32(word) pg_popcount32(word)
+#define PG_POPCOUNT64(word) pg_popcount64(word)
+#endif
+
+
+/*
+ * drive_popcount64(count int, num int) returns bigint
+ *
+ * num is the number of 64-bit words to use
+ */
+PG_FUNCTION_INFO_V1(drive_popcount64);
+Datum
+drive_popcount64(PG_FUNCTION_ARGS)
+{
+
+ int count = PG_GETARG_INT32(0);
+ int num = PG_GETARG_INT32(1);
+
+ int len = num * sizeof(uint64);
+ uint64 *words = palloc(len);
+ int64 popcount;
+
+ for (int i = 0; i < num; i++)
+ words[i] = i;
+
+ while (count--)
+ {
+ popcount = 0;
+ for (int i = 0; i < num; i++)
+ popcount += PG_POPCOUNT64(words[i]);
+ }
+
+ PG_RETURN_INT64(popcount);
+}
+
+/*
+ * drive_popcount(count int, len int) returns bigint
+ *
+ * num is the number of 64-bit words to use
+ */
+PG_FUNCTION_INFO_V1(drive_popcount);
+Datum
+drive_popcount(PG_FUNCTION_ARGS)
+{
+
+ int count = PG_GETARG_INT32(0);
+ int num = PG_GETARG_INT32(1);
+
+ int len = num * sizeof(uint64);
+ uint64 *words = palloc(len);
+ int64 popcount;
+
+ for (int i = 0; i < num; i++)
+ words[i] = i;
+
+ while (count--)
+ popcount = pg_popcount((const char*) words, len);
+
+ PG_RETURN_INT64(popcount);
+}
diff --git a/src/test/modules/test_popcount/test_popcount.control b/src/test/modules/test_popcount/test_popcount.control
new file mode 100644
index 0000000000..6837d724b4
--- /dev/null
+++ b/src/test/modules/test_popcount/test_popcount.control
@@ -0,0 +1,4 @@
+comment = 'test'
+default_version = '1.0'
+module_pathname = '$libdir/test_popcount'
+relocatable = true
From f65964c997cf8958dd9e53a04fb7e92098c784f1 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Wed, 11 Aug 2021 12:17:51 -0400
Subject: [PATCH v1 1/2] Use direct function calls for pg_popcount{32,64} on
non-x86 platforms
Previously, all pg_popcount{32,64} calls were indirected through
a function pointer, even though we had no fast implementation for
non-x86 platforms. To fix, use macros to cover both the direct and
indirect cases, and define them appropriately similar to the CRC code.
---
src/backend/access/heap/visibilitymap.c | 6 ++--
src/backend/nodes/bitmapset.c | 4 +--
src/include/port/pg_bitutils.h | 42 +++++++++++++++++++++-
src/port/pg_bitutils.c | 46 ++++---------------------
4 files changed, 52 insertions(+), 46 deletions(-)
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 114fbbdd30..75b0377807 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -411,14 +411,14 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
if (all_frozen == NULL)
{
for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
- nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
+ nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64);
}
else
{
for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
{
- nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
- nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
+ nvisible += PG_POPCOUNT64(map[i] & VISIBLE_MASK64);
+ nfrozen += PG_POPCOUNT64(map[i] & FROZEN_MASK64);
}
}
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 649478b0d4..a6c4c62914 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -57,11 +57,11 @@
#if BITS_PER_BITMAPWORD == 32
#define bmw_leftmost_one_pos(w) pg_leftmost_one_pos32(w)
#define bmw_rightmost_one_pos(w) pg_rightmost_one_pos32(w)
-#define bmw_popcount(w) pg_popcount32(w)
+#define bmw_popcount(w) PG_POPCOUNT32(w)
#elif BITS_PER_BITMAPWORD == 64
#define bmw_leftmost_one_pos(w) pg_leftmost_one_pos64(w)
#define bmw_rightmost_one_pos(w) pg_rightmost_one_pos64(w)
-#define bmw_popcount(w) pg_popcount64(w)
+#define bmw_popcount(w) PG_POPCOUNT64(w)
#else
#error "invalid BITS_PER_BITMAPWORD"
#endif
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 086bd08132..ea97192b3a 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -253,9 +253,49 @@ pg_ceil_log2_64(uint64 num)
return pg_leftmost_one_pos64(num - 1) + 1;
}
-/* Count the number of one-bits in a uint32 or uint64 */
+/*
+ * With MSVC on x86_64 builds, try using native popcnt instructions via the
+ * __popcnt and __popcnt64 intrinsics. These don't work the same as GCC's
+ * __builtin_popcount* intrinsic functions as they always emit popcnt
+ * instructions.
+ */
+#if defined(_MSC_VER) && defined(_M_AMD64)
+#define HAVE_X86_64_POPCNTQ
+#endif
+
+/*
+ * On x86_64, we can use the hardware popcount instruction, but only if
+ * we can verify that the CPU supports it via the cpuid instruction.
+ *
+ * Otherwise, we fall back to a hand-rolled implementation.
+ */
+#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+#define TRY_POPCNT_FAST 1
+#endif
+#endif
+
+#ifdef TRY_POPCNT_FAST
+/* Use the POPCNT instruction, but perform a runtime check first. */
+#define PG_POPCOUNT32(word) pg_popcount32(word)
+#define PG_POPCOUNT64(word) pg_popcount64(word)
+
+extern int pg_popcount32_slow(uint32 word);
+extern int pg_popcount64_slow(uint64 word);
extern int (*pg_popcount32) (uint32 word);
extern int (*pg_popcount64) (uint64 word);
+extern int pg_popcount32_fast(uint32 word);
+extern int pg_popcount64_fast(uint64 word);
+
+#else
+/* Use a portable implementation */
+#define PG_POPCOUNT32(word) pg_popcount32_slow(word)
+#define PG_POPCOUNT64(word) pg_popcount64_slow(word)
+
+extern int pg_popcount32_slow(uint32 word);
+extern int pg_popcount64_slow(uint64 word);
+
+#endif /* TRY_POPCNT_FAST */
/* Count the number of one-bits in a byte array */
extern uint64 pg_popcount(const char *buf, int bytes);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 10676b285c..3e90de5249 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -103,47 +103,13 @@ const uint8 pg_number_of_ones[256] = {
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
-/*
- * With MSVC on x86_64 builds, try using native popcnt instructions via the
- * __popcnt and __popcnt64 intrinsics. These don't work the same as GCC's
- * __builtin_popcount* intrinsic functions as they always emit popcnt
- * instructions.
- */
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#define HAVE_X86_64_POPCNTQ
-#endif
-
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to __builtin_popcount if the compiler has that,
- * or a hand-rolled implementation if not.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_FAST 1
-#endif
-#endif
-
-static int pg_popcount32_slow(uint32 word);
-static int pg_popcount64_slow(uint64 word);
-
#ifdef TRY_POPCNT_FAST
static bool pg_popcount_available(void);
static int pg_popcount32_choose(uint32 word);
static int pg_popcount64_choose(uint64 word);
-static int pg_popcount32_fast(uint32 word);
-static int pg_popcount64_fast(uint64 word);
int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#else
-int (*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int (*pg_popcount64) (uint64 word) = pg_popcount64_slow;
-#endif /* TRY_POPCNT_FAST */
-
-#ifdef TRY_POPCNT_FAST
/*
* Return true if CPUID indicates that the POPCNT instruction is available.
@@ -208,7 +174,7 @@ pg_popcount64_choose(uint64 word)
* pg_popcount32_fast
* Return the number of 1 bits set in word
*/
-static int
+int
pg_popcount32_fast(uint32 word)
{
#ifdef _MSC_VER
@@ -225,7 +191,7 @@ __asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
* pg_popcount64_fast
* Return the number of 1 bits set in word
*/
-static int
+int
pg_popcount64_fast(uint64 word)
{
#ifdef _MSC_VER
@@ -245,7 +211,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
* pg_popcount32_slow
* Return the number of 1 bits set in word
*/
-static int
+int
pg_popcount32_slow(uint32 word)
{
#ifdef HAVE__BUILTIN_POPCOUNT
@@ -267,7 +233,7 @@ pg_popcount32_slow(uint32 word)
* pg_popcount64_slow
* Return the number of 1 bits set in word
*/
-static int
+int
pg_popcount64_slow(uint64 word)
{
#ifdef HAVE__BUILTIN_POPCOUNT
@@ -309,7 +275,7 @@ pg_popcount(const char *buf, int bytes)
while (bytes >= 8)
{
- popcnt += pg_popcount64(*words++);
+ popcnt += PG_POPCOUNT64(*words++);
bytes -= 8;
}
@@ -323,7 +289,7 @@ pg_popcount(const char *buf, int bytes)
while (bytes >= 4)
{
- popcnt += pg_popcount32(*words++);
+ popcnt += PG_POPCOUNT32(*words++);
bytes -= 4;
}
--
2.31.1
From 225073b1741fd89ec7728e28978e057e9dc11116 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Wed, 11 Aug 2021 12:22:24 -0400
Subject: [PATCH v1 2/2] Replace intrinsics in pg_popcount*_slow with pure C
code
Intrinsics are used in the hope that the compiler will access some fast
hardware implementation where available. However, on x86 at least,
__builtin_popcount() didn't emit a POPCNT instruction since -mpopcnt
wasn't passed to the compiler. Instead, the compiler emitted bitwise
operations where the intrinsic was supported. Where not supported,
we used a byte-at-a-time loop using a lookup table.
Since the *slow functions are fallback implementations, replace the
intrinsics and the associated #ifdef maze with the bitwise operations
written in C so all platforms can benefit from them.
If we ever get configure support for x86-64-v2, we could use these
intrinsics to emit the POPCNT instruction without a runtime check. To
allow for that possibility, let's keep the configure checks around.
---
src/port/pg_bitutils.c | 47 ++++++++++++++----------------------------
1 file changed, 15 insertions(+), 32 deletions(-)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 3e90de5249..500372db10 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -207,6 +207,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
#endif /* TRY_POPCNT_FAST */
+/*
+ * The *_slow implementations are based on
+ * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ */
+
/*
* pg_popcount32_slow
* Return the number of 1 bits set in word
@@ -214,19 +219,11 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
int
pg_popcount32_slow(uint32 word)
{
-#ifdef HAVE__BUILTIN_POPCOUNT
- return __builtin_popcount(word);
-#else /* !HAVE__BUILTIN_POPCOUNT */
- int result = 0;
-
- while (word != 0)
- {
- result += pg_number_of_ones[word & 255];
- word >>= 8;
- }
-
- return result;
-#endif /* HAVE__BUILTIN_POPCOUNT */
+ word = word - ((word >> 1) & 0x55555555);
+ word = (word & 0x33333333) +
+ ((word >> 2) & 0x33333333);
+ word = (word + (word >> 4)) & 0xF0F0F0F;
+ return (int) ((word * 0x1010101) >> 24);
}
/*
@@ -236,25 +233,11 @@ pg_popcount32_slow(uint32 word)
int
pg_popcount64_slow(uint64 word)
{
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if defined(HAVE_LONG_INT_64)
- return __builtin_popcountl(word);
-#elif defined(HAVE_LONG_LONG_INT_64)
- return __builtin_popcountll(word);
-#else
-#error must have a working 64-bit integer datatype
-#endif
-#else /* !HAVE__BUILTIN_POPCOUNT */
- int result = 0;
-
- while (word != 0)
- {
- result += pg_number_of_ones[word & 255];
- word >>= 8;
- }
-
- return result;
-#endif /* HAVE__BUILTIN_POPCOUNT */
+ word = word - ((word >> 1) & UINT64CONST(0x5555555555555555));
+ word = (word & UINT64CONST(0x3333333333333333)) +
+ ((word >> 2) & UINT64CONST(0x3333333333333333));
+ word = (word + (word >> 4)) & UINT64CONST(0xF0F0F0F0F0F0F0F);
+ return (int) ((word * UINT64CONST(0x101010101010101)) >> 56);
}
--
2.31.1