On 2019-Feb-14, Tom Lane wrote:
> I'd bet a fair amount of money that we'd be better off *not* using
> lzcnt, even if available, because then we could just expose things
> along this line:
>
> static inline int
> pg_clz(...)
> {
> #ifdef HAVE__BUILTIN_CLZ
> return __builtin_clz(x);
> #else
> handwritten implementation;
> #endif
> }
>
> Avoiding a function call (that has to indirect through a pointer) probably
> saves much more than the difference between lzcnt and the other way.
I see ... makes sense.
That leads me to the attached patch. It creates a new file
pg_popcount.c which is the only one compiled with -mpopcnt (if
available); if there's no compiler switch to enable POPCNT, we just
don't compile the file. I'm not sure that's kosher -- in particular I'm
not sure if it can fail when POPCNT is enabled by other flags and
-mpopcnt is not needed at all. I think our c-compiler.m4 stuff is a bit
too simplistic there: it just assumes that -mpopcnt is always required.
But what if the user passes it in CFLAGS?
I left CPUID alone for the CLZ/CTZ builtins. So we either use the
table, or the builtins. We never try the instructions.
--
Álvaro Herrera https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
commit 6c771a4f43da0409ae5fa9ff1b1579f381c451c0
Author: Alvaro Herrera <[email protected]>
AuthorDate: Thu Feb 14 15:18:03 2019 -0300
CommitDate: Thu Feb 14 19:24:03 2019 -0300
fix popcount etc
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 148c5550573..635eb9331af 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -10,17 +10,20 @@
*
*------------------------------------------------------------------------ -
*/
-
#ifndef PG_BITUTILS_H
#define PG_BITUTILS_H
-extern int (*pg_popcount32) (uint32 word);
-extern int (*pg_popcount64) (uint64 word);
-extern int (*pg_rightmost_one32) (uint32 word);
-extern int (*pg_rightmost_one64) (uint64 word);
-extern int (*pg_leftmost_one32) (uint32 word);
-extern int (*pg_leftmost_one64) (uint64 word);
+extern int pg_rightmost_one32(uint32 word);
+extern int pg_rightmost_one64(uint64 word);
+extern int pg_leftmost_one32(uint32 word);
+extern int pg_leftmost_one64(uint64 word);
+extern int (*pg_popcount32) (uint32 word);
+extern int (*pg_popcount64) (uint64 word);
extern uint64 pg_popcount(const char *buf, int bytes);
+/* in pg_popcount.c */
+extern int pg_popcount32_sse42(uint32 word);
+extern int pg_popcount64_sse42(uint64 word);
+
#endif /* PG_BITUTILS_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index 2da73260a13..d7290573c65 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -41,6 +41,13 @@ OBJS = $(LIBOBJS) $(PG_CRC32C_OBJS) chklocale.o erand48.o inet_net_ntop.o \
qsort.o qsort_arg.o quotes.o snprintf.o sprompt.o strerror.o \
tar.o thread.o
+# If the compiler supports a flag for the POPCOUNT instruction, we compile
+# pg_popcount.o with it. (Whether to actually use the functions therein is
+# determined at runtime by testing CPUID flags.)
+ifneq ($(CFLAGS_POPCNT),)
+OBJS += pg_popcount.o
+endif
+
# libpgport.a, libpgport_shlib.a, and libpgport_srv.a contain the same files
# foo.o, foo_shlib.o, and foo_srv.o are all built from foo.c
OBJS_SHLIB = $(OBJS:%.o=%_shlib.o)
@@ -78,10 +85,10 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_ARMV8_CRC32C)
-# pg_bitutils.c needs CFLAGS_POPCNT
-pg_bitutils.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
-pg_bitutils_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
+# pg_popcount.c needs CFLAGS_POPCNT
+pg_popcount.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
#
# Shared library versions of object files
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index aac394fe927..23d317d111e 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -10,7 +10,6 @@
*
*-------------------------------------------------------------------------
*/
-
#include "postgres.h"
#ifdef HAVE__GET_CPUID
@@ -23,61 +22,21 @@
#include "port/pg_bitutils.h"
-#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID)
+#ifdef HAVE__BUILTIN_POPCOUNT
static bool pg_popcount_available(void);
-static int pg_popcount32_choose(uint32 word);
-static int pg_popcount32_sse42(uint32 word);
-static int pg_popcount64_choose(uint64 word);
-static int pg_popcount64_sse42(uint64 word);
-#endif
-static int pg_popcount32_slow(uint32 word);
-static int pg_popcount64_slow(uint64 word);
-
-#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ))
-static bool pg_lzcnt_available(void);
-#endif
-
-#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID)
-static int pg_rightmost_one32_choose(uint32 word);
-static int pg_rightmost_one32_abm(uint32 word);
-static int pg_rightmost_one64_choose(uint64 word);
-static int pg_rightmost_one64_abm(uint64 word);
-#endif
-static int pg_rightmost_one32_slow(uint32 word);
-static int pg_rightmost_one64_slow(uint64 word);
-
-#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID)
-static int pg_leftmost_one32_choose(uint32 word);
-static int pg_leftmost_one32_abm(uint32 word);
-static int pg_leftmost_one64_choose(uint64 word);
-static int pg_leftmost_one64_abm(uint64 word);
-#endif
-static int pg_leftmost_one32_slow(uint32 word);
-static int pg_leftmost_one64_slow(uint64 word);
-
-#if defined(HAVE__BUILTIN_POPCOUNT) && defined(HAVE__GET_CPUID)
-int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
-int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
+static int pg_popcount32_choose(uint32 word);
+static int pg_popcount32_builtin(uint32 word);
+static int pg_popcount64_choose(uint64 word);
+static int pg_popcount64_builtin(uint64 word);
+int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
+int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
#else
-int (*pg_popcount32) (uint32 word) = pg_popcount32_slow;
-int (*pg_popcount64) (uint64 word) = pg_popcount64_slow;
-#endif
+static int pg_popcount32_slow(uint32 word);
+static int pg_popcount64_slow(uint64 word);
+int (*pg_popcount32) (uint32 word) = pg_popcount32_slow;
+int (*pg_popcount64) (uint64 word) = pg_popcount64_slow;
+#endif /* !HAVE_BUILTIN_POPCOUNT */
-#if defined(HAVE__BUILTIN_CTZ) && defined(HAVE__GET_CPUID)
-int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_choose;
-int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_choose;
-#else
-int (*pg_rightmost_one32) (uint32 word) = pg_rightmost_one32_slow;
-int (*pg_rightmost_one64) (uint64 word) = pg_rightmost_one64_slow;
-#endif
-
-#if defined(HAVE__BUILTIN_CLZ) && defined(HAVE__GET_CPUID)
-int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_choose;
-int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_choose;
-#else
-int (*pg_leftmost_one32) (uint32 word) = pg_leftmost_one32_slow;
-int (*pg_leftmost_one64) (uint64 word) = pg_leftmost_one64_slow;
-#endif
/* Array marking the number of 1-bits for each value of 0-255. */
static const uint8 number_of_ones[256] = {
@@ -99,6 +58,7 @@ static const uint8 number_of_ones[256] = {
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
+#ifndef HAVE__BUILTIN_CTZ
/*
* Array marking the position of the right-most set bit for each value of
* 1-255. We count the right-most position as the 0th bit, and the
@@ -122,7 +82,9 @@ static const uint8 rightmost_one_pos[256] = {
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
+#endif /* !HAVE__BUILTIN_CTZ */
+#ifndef HAVE__BUILTIN_CLZ
/*
* Array marking the position of the left-most set bit for each value of
* 1-255. We count the right-most position as the 0th bit, and the
@@ -146,31 +108,36 @@ static const uint8 leftmost_one_pos[256] = {
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};
+#endif /* !HAVE_BUILTIN_CLZ */
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT)
-
+/*
+ * Return true iff we have CPUID support and it indicates that the POPCNT
+ * instruction is available.
+ */
static bool
pg_popcount_available(void)
{
- unsigned int exx[4] = { 0, 0, 0, 0 };
+#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
+ unsigned int exx[4] = {0, 0, 0, 0};
#if defined(HAVE__GET_CPUID)
__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID)
__cpuid(exx, 1);
-#else
-#error cpuid instruction not available
#endif
return (exx[2] & (1 << 23)) != 0; /* POPCNT */
-}
+#else /* HAVE__GET_CPUID || HAVE__CPUID */
+
+ return false;
#endif
+}
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT)
-
+#ifdef HAVE__BUILTIN_POPCOUNT
/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
+ * This gets called on the first call to pg_popcount32. It replaces the
+ * function pointer so that subsequent calls are routed directly to the chosen
+ * implementation.
*/
static int
pg_popcount32_choose(uint32 word)
@@ -178,18 +145,17 @@ pg_popcount32_choose(uint32 word)
if (pg_popcount_available())
pg_popcount32 = pg_popcount32_sse42;
else
- pg_popcount32 = pg_popcount32_slow;
+ pg_popcount32 = pg_popcount32_builtin;
return pg_popcount32(word);
}
static int
-pg_popcount32_sse42(uint32 word)
+pg_popcount32_builtin(uint32 word)
{
return __builtin_popcount(word);
}
-#endif
-
+#else /* HAVE__BUILTIN_POPCOUNT */
/*
* pg_popcount32_slow
* Return the number of 1 bits set in word
@@ -197,7 +163,7 @@ pg_popcount32_sse42(uint32 word)
static int
pg_popcount32_slow(uint32 word)
{
- int result = 0;
+ int result = 0;
while (word != 0)
{
@@ -207,6 +173,7 @@ pg_popcount32_slow(uint32 word)
return result;
}
+#endif
/*
* pg_popcount
@@ -215,13 +182,13 @@ pg_popcount32_slow(uint32 word)
uint64
pg_popcount(const char *buf, int bytes)
{
- uint64 popcnt = 0;
+ uint64 popcnt = 0;
#if SIZEOF_VOID_P >= 8
/* Process in 64-bit chunks if the buffer is aligned. */
if (buf == (char *) TYPEALIGN(8, buf))
{
- uint64 *words = (uint64 *) buf;
+ uint64 *words = (uint64 *) buf;
while (bytes >= 8)
{
@@ -235,7 +202,7 @@ pg_popcount(const char *buf, int bytes)
/* Process in 32-bit chunks if the buffer is aligned. */
if (buf == (char *) TYPEALIGN(4, buf))
{
- uint32 *words = (uint32 *) buf;
+ uint32 *words = (uint32 *) buf;
while (bytes >= 4)
{
@@ -254,11 +221,11 @@ pg_popcount(const char *buf, int bytes)
return popcnt;
}
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_POPCOUNT)
-
+#ifdef HAVE__BUILTIN_POPCOUNT
/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
+ * This gets called on the first call to pg_popcount64. It replaces the
+ * function pointer so that subsequent calls are routed directly to the chosen
+ * implementation.
*/
static int
pg_popcount64_choose(uint64 word)
@@ -266,26 +233,24 @@ pg_popcount64_choose(uint64 word)
if (pg_popcount_available())
pg_popcount64 = pg_popcount64_sse42;
else
- pg_popcount64 = pg_popcount64_slow;
+ pg_popcount64 = pg_popcount64_builtin;
return pg_popcount64(word);
}
static int
-pg_popcount64_sse42(uint64 word)
+pg_popcount64_builtin(uint64 word)
{
#if defined(HAVE_LONG_INT_64)
return __builtin_popcountl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return __builtin_popcountll(word);
#else
- /* shouldn't happen */
#error must have a working 64-bit integer datatype
#endif
}
-#endif
-
+#else /* HAVE__BUILTIN_POPCOUNT */
/*
* pg_popcount64_slow
* Return the number of 1 bits set in word
@@ -293,7 +258,7 @@ pg_popcount64_sse42(uint64 word)
static int
pg_popcount64_slow(uint64 word)
{
- int result = 0;
+ int result = 0;
while (word != 0)
{
@@ -303,156 +268,77 @@ pg_popcount64_slow(uint64 word)
return result;
}
-
-#if defined(HAVE__GET_CPUID) && (defined(HAVE__BUILTIN_CTZ) || defined(HAVE__BUILTIN_CLZ))
-
-static bool
-pg_lzcnt_available(void)
-{
-
- unsigned int exx[4] = { 0, 0, 0, 0 };
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(0x80000001, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 0x80000001);
-#else
-#error cpuid instruction not available
-#endif
-
- return (exx[2] & (1 << 5)) != 0; /* LZCNT */
-}
-#endif
-
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ)
-/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static int
-pg_rightmost_one32_choose(uint32 word)
-{
- if (pg_lzcnt_available())
- pg_rightmost_one32 = pg_rightmost_one32_abm;
- else
- pg_rightmost_one32 = pg_rightmost_one32_slow;
-
- return pg_rightmost_one32(word);
-}
-
-static int
-pg_rightmost_one32_abm(uint32 word)
-{
- return __builtin_ctz(word);
-}
-
#endif
/*
- * pg_rightmost_one32_slow
+ * pg_rightmost_one32
* Returns the number of trailing 0-bits in word, starting at the least
* significant bit position. word must not be 0.
*/
-static int
-pg_rightmost_one32_slow(uint32 word)
+int
+pg_rightmost_one32(uint32 word)
{
- int result = 0;
+ int result = 0;
Assert(word != 0);
+#ifdef HAVE__BUILTIN_CTZ
+ result = __builtin_ctz(word);
+#else
while ((word & 255) == 0)
{
word >>= 8;
result += 8;
}
result += rightmost_one_pos[word & 255];
+#endif /* HAVE__BUILTIN_CTZ */
return result;
}
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CTZ)
/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
+ * pg_rightmost_one64
+ * Returns the number of trailing 0-bits in word, starting at the least
+ * significant bit position. word must not be 0.
*/
-static int
-pg_rightmost_one64_choose(uint64 word)
+int
+pg_rightmost_one64(uint64 word)
{
- if (pg_lzcnt_available())
- pg_rightmost_one64 = pg_rightmost_one64_abm;
- else
- pg_rightmost_one64 = pg_rightmost_one64_slow;
+ int result = 0;
- return pg_rightmost_one64(word);
-}
+ Assert(word != 0);
-static int
-pg_rightmost_one64_abm(uint64 word)
-{
+#ifdef HAVE__BUILTIN_CTZ
#if defined(HAVE_LONG_INT_64)
return __builtin_ctzl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return __builtin_ctzll(word);
#else
- /* shouldn't happen */
#error must have a working 64-bit integer datatype
#endif
-}
-#endif
-
-/*
- * pg_rightmost_one64_slow
- * Returns the number of trailing 0-bits in word, starting at the least
- * significant bit position. word must not be 0.
- */
-static int
-pg_rightmost_one64_slow(uint64 word)
-{
- int result = 0;
-
- Assert(word != 0);
-
+#else /* HAVE__BUILTIN_CTZ */
while ((word & 255) == 0)
{
word >>= 8;
result += 8;
}
result += rightmost_one_pos[word & 255];
+#endif
return result;
}
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ)
/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static int
-pg_leftmost_one32_choose(uint32 word)
-{
- if (pg_lzcnt_available())
- pg_leftmost_one32 = pg_leftmost_one32_abm;
- else
- pg_leftmost_one32 = pg_leftmost_one32_slow;
-
- return pg_leftmost_one32(word);
-}
-
-static int
-pg_leftmost_one32_abm(uint32 word)
-{
- return 31 - __builtin_clz(word);
-}
-#endif
-
-/*
- * pg_leftmost_one32_slow
+ * pg_leftmost_one32
* Returns the 0-based position of the most significant set bit in word
* measured from the least significant bit. word must not be 0.
*/
-static int
-pg_leftmost_one32_slow(uint32 word)
+int
+pg_leftmost_one32(uint32 word)
{
+#ifdef HAVE__BUILTIN_CLZ
+ return 31 - __builtin_clz(word);
+#else
int shift = 32 - 8;
Assert(word != 0);
@@ -461,53 +347,32 @@ pg_leftmost_one32_slow(uint32 word)
shift -= 8;
return shift + leftmost_one_pos[(word >> shift) & 255];
+#endif /* HAVE__BUILTIN_CLZ */
}
-#if defined(HAVE__GET_CPUID) && defined(HAVE__BUILTIN_CLZ)
/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
+ * pg_leftmost_one64
+ * Returns the 0-based position of the most significant set bit in word
+ * measured from the least significant bit. word must not be 0.
*/
-static int
-pg_leftmost_one64_choose(uint64 word)
-{
- if (pg_lzcnt_available())
- pg_leftmost_one64 = pg_leftmost_one64_abm;
- else
- pg_leftmost_one64 = pg_leftmost_one64_slow;
-
- return pg_leftmost_one64(word);
-}
-
-static int
-pg_leftmost_one64_abm(uint64 word)
+int
+pg_leftmost_one64(uint64 word)
{
+#ifdef HAVE__BUILTIN_CLZ
#if defined(HAVE_LONG_INT_64)
return 63 - __builtin_clzl(word);
#elif defined(HAVE_LONG_LONG_INT_64)
return 63 - __builtin_clzll(word);
#else
- /* shouldn't happen */
#error must have a working 64-bit integer datatype
#endif
-
-}
-#endif
-
-/*
- * pg_leftmost_one64_slow
- * Returns the 0-based position of the most significant set bit in word
- * measured from the least significant bit. word must not be 0.
- */
-static int
-pg_leftmost_one64_slow(uint64 word)
-{
+#else /* HAVE__BUILTIN_CLZ */
int shift = 64 - 8;
Assert(word != 0);
-
while ((word >> shift) == 0)
shift -= 8;
return shift + leftmost_one_pos[(word >> shift) & 255];
+#endif /* !HAVE__BUIILTIN_CLZ */
}
diff --git a/src/port/pg_popcount.c b/src/port/pg_popcount.c
new file mode 100644
index 00000000000..5254c41273f
--- /dev/null
+++ b/src/port/pg_popcount.c
@@ -0,0 +1,36 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount.c
+ * CPU-optimized implementation of pg_popcount
+ *
+ * This file must be compiled with a compiler-specific flag to enable the
+ * POPCOUNT instruction.
+ *
+ * Portions Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/port/pg_popcount.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "port/pg_bitutils.h"
+
+int
+pg_popcount32_sse42(uint32 word)
+{
+ return __builtin_popcount(word);
+}
+
+int
+pg_popcount64_sse42(uint64 word)
+{
+#if defined(HAVE_LONG_INT_64)
+ return __builtin_popcountl(word);
+#elif defined(HAVE_LONG_LONG_INT_64)
+ return __builtin_popcountll(word);
+#else
+#error must have a working 64-bit integer datatype
+#endif
+}