On Thu, 11 Mar 2021 at 04:25, John Naylor <john.nay...@enterprisedb.com> wrote: > Okay, so it's hard-coded in various functions in contrib modules. In that > case, let's just keep the existing coding for those. In fact, the comments > that got removed by your patch specifically say: /* Using the popcount > functions here isn't likely to win */ ...which your testing confirmed. The > new function should be used only for Gist and possibly Intarray, whose > default is 124. That means we can't get rid of hemdistsign(), but that's > fine.
The comment is there for all types. Since I get the performance better on all the types, I have kept the pg_xorcount() call for all these contrib modules. I understand that since for some types default siglen is small, we won't get benefit. But I think we should call pg_xorcount() for the benefit of non-default siglen case. Have replaced hemdistsign() by pg_xorcount() everywhere; but with that, the call looks a bit clumsy because of having to type-cast the parameters to const unsigned char *. I noticed that the cast to "unsigned char *" is required only when we use the value in the pg_number_of_ones array. Elsewhere we can safely use 'a' and 'b' as "char *". So I changed the pg_xorcount() parameters from unsigned char * to char *. > I think the way to go is a simplified version of your 0001 (not 0002), with > only a single function, for gist and intarray only, and a style that better > matches the surrounding code. If you look at my xor functions in the attached > text file, you'll get an idea of what it should look like. Note that it got > the above performance without ever trying to massage the pointer alignment. > I'm a bit uncomfortable with the fact that we can't rely on alignment, but > maybe there's a simple fix somewhere in the gist code. In the attached 0001-v3 patch, I have updated the code to match it with surrounding code; specifically the usage of *a++ rather than a[i]. Regarding the alignment changes... I have removed the code that handled the leading identically unaligned bytes, for lack of evidence that percentage of such cases is significant. Like I noted earlier, for the tsearch data I used, identically unaligned cases were only 6%. If I find scenarios where these cases can be significant after all and if we cannot do anything in the gist index code, then we might have to bring back the unaligned byte handling. I didn't get a chance to dig deeper into the gist index implementation to see why they are not always 8-byte aligned. I have kept the 0002 patch as-is. Due to significant *additional* speedup, over and above the 0001 improvement, I think the code re-arrangement done is worth it for non-x86 platforms. -Amit Khandekar
From c6ae575dd483b85cec6748c2e014d0f32565b4eb Mon Sep 17 00:00:00 2001 From: Amit Khandekar <amitdkhan...@gmail.com> Date: Fri, 19 Mar 2021 20:22:44 +0800 Subject: [PATCH 1/2] Speed up xor'ing of two gist index signatures for tsvectors In hemdistsign(), rather than using xor operator on char values, use it in 64-bit chunks. And since the chunks are 64-bit, use popcount64() on each of the chunks. I have checked that the two bitvector pointer arguments of hemdistsign() are not always 64-bit aligned. So do the 64-bit chunks only if both pointers are 8 byte-aligned. This results in speed-up in Gist index creation for tsvectors. With default siglen (124), the speed up is 12-20%. With siglen=700, it is 30-50%. So with longer signature lengths, we get higher percentage speed-up. Similar results are seen in other types using gist index, such as intarray, hstore, and ltree that are availale in contrib. With smaller siglens such as 10, 20 etc, there is a bit of a reduction in speed by 1-7% if we use this optimization. It's probably because of an extra function call for pg_xorcount(); and also might be due to the extra logic in pg_xorcount(). So for siglen less than 32, keep the existing method using byte-by-byte traversal. --- contrib/hstore/hstore_gist.c | 17 ++-------------- contrib/intarray/_intbig_gist.c | 18 +--------------- contrib/ltree/_ltree_gist.c | 19 ++--------------- src/backend/utils/adt/tsgistidx.c | 26 +++++------------------ src/include/port/pg_bitutils.h | 17 ++++++++++++++++ src/port/pg_bitutils.c | 34 +++++++++++++++++++++++++++++++ 6 files changed, 61 insertions(+), 70 deletions(-) diff --git a/contrib/hstore/hstore_gist.c b/contrib/hstore/hstore_gist.c index 102c9cea72..4970a0a2f0 100644 --- a/contrib/hstore/hstore_gist.c +++ b/contrib/hstore/hstore_gist.c @@ -8,6 +8,7 @@ #include "access/stratnum.h" #include "catalog/pg_type.h" #include "hstore.h" +#include "port/pg_bitutils.h" #include "utils/pg_crc.h" /* gist_hstore_ops opclass options */ @@ -256,20 +257,6 @@ sizebitvec(BITVECP sign, int siglen) return size; } -static int -hemdistsign(BITVECP a, BITVECP b, int siglen) -{ - int i, - dist = 0; - - LOOPBIT(siglen) - { - if (GETBIT(a, i) != GETBIT(b, i)) - dist++; - } - return dist; -} - static int hemdist(GISTTYPE *a, GISTTYPE *b, int siglen) { @@ -283,7 +270,7 @@ hemdist(GISTTYPE *a, GISTTYPE *b, int siglen) else if (ISALLTRUE(b)) return SIGLENBIT(siglen) - sizebitvec(GETSIGN(a), siglen); - return hemdistsign(GETSIGN(a), GETSIGN(b), siglen); + return pg_xorcount(GETSIGN(a), GETSIGN(b), siglen); } static int32 diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c index 18ecd8cda6..8aa76042b0 100644 --- a/contrib/intarray/_intbig_gist.c +++ b/contrib/intarray/_intbig_gist.c @@ -211,22 +211,6 @@ sizebitvec(BITVECP sign, int siglen) return pg_popcount(sign, siglen); } -static int -hemdistsign(BITVECP a, BITVECP b, int siglen) -{ - int i, - diff, - dist = 0; - - LOOPBYTE(siglen) - { - diff = (unsigned char) (a[i] ^ b[i]); - /* Using the popcount functions here isn't likely to win */ - dist += pg_number_of_ones[diff]; - } - return dist; -} - static int hemdist(GISTTYPE *a, GISTTYPE *b, int siglen) { @@ -240,7 +224,7 @@ hemdist(GISTTYPE *a, GISTTYPE *b, int siglen) else if (ISALLTRUE(b)) return SIGLENBIT(siglen) - sizebitvec(GETSIGN(a), siglen); - return hemdistsign(GETSIGN(a), GETSIGN(b), siglen); + return pg_xorcount(GETSIGN(a), GETSIGN(b), siglen); } Datum diff --git a/contrib/ltree/_ltree_gist.c b/contrib/ltree/_ltree_gist.c index 72516c3b6b..85a840311e 100644 --- a/contrib/ltree/_ltree_gist.c +++ b/contrib/ltree/_ltree_gist.c @@ -180,22 +180,6 @@ sizebitvec(BITVECP sign, int siglen) return pg_popcount((const char *) sign, siglen); } -static int -hemdistsign(BITVECP a, BITVECP b, int siglen) -{ - int i, - diff, - dist = 0; - - ALOOPBYTE(siglen) - { - diff = (unsigned char) (a[i] ^ b[i]); - /* Using the popcount functions here isn't likely to win */ - dist += pg_number_of_ones[diff]; - } - return dist; -} - static int hemdist(ltree_gist *a, ltree_gist *b, int siglen) { @@ -209,7 +193,8 @@ hemdist(ltree_gist *a, ltree_gist *b, int siglen) else if (LTG_ISALLTRUE(b)) return ASIGLENBIT(siglen) - sizebitvec(LTG_SIGN(a), siglen); - return hemdistsign(LTG_SIGN(a), LTG_SIGN(b), siglen); + return pg_xorcount((const char *) LTG_SIGN(a), (const char *) LTG_SIGN(b), + siglen); } diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index c09eefdda2..1659bc2727 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -486,22 +486,6 @@ sizebitvec(BITVECP sign, int siglen) return pg_popcount(sign, siglen); } -static int -hemdistsign(BITVECP a, BITVECP b, int siglen) -{ - int i, - diff, - dist = 0; - - LOOPBYTE(siglen) - { - diff = (unsigned char) (a[i] ^ b[i]); - /* Using the popcount functions here isn't likely to win */ - dist += pg_number_of_ones[diff]; - } - return dist; -} - static int hemdist(SignTSVector *a, SignTSVector *b) { @@ -520,7 +504,7 @@ hemdist(SignTSVector *a, SignTSVector *b) Assert(siglena == siglenb); - return hemdistsign(GETSIGN(a), GETSIGN(b), siglena); + return pg_xorcount(GETSIGN(a), GETSIGN(b), siglena); } Datum @@ -551,7 +535,7 @@ gtsvector_penalty(PG_FUNCTION_ARGS) (float) (siglenbit + 1); } else - *penalty = hemdistsign(sign, orig, siglen); + *penalty = pg_xorcount(sign, orig, siglen); pfree(sign); } @@ -611,7 +595,7 @@ hemdistcache(CACHESIGN *a, CACHESIGN *b, int siglen) else if (b->allistrue) return SIGLENBIT(siglen) - sizebitvec(a->sign, siglen); - return hemdistsign(a->sign, b->sign, siglen); + return pg_xorcount(a->sign, b->sign, siglen); } Datum @@ -732,7 +716,7 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) siglen); } else - size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l), siglen); + size_alpha = pg_xorcount(cache[j].sign, GETSIGN(datum_l), siglen); if (ISALLTRUE(datum_r) || cache[j].allistrue) { @@ -746,7 +730,7 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) siglen); } else - size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r), siglen); + size_beta = pg_xorcount(cache[j].sign, GETSIGN(datum_r), siglen); if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1)) { diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index f9b77ec278..26c8c9e38e 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -214,6 +214,23 @@ extern int (*pg_popcount64) (uint64 word); /* Count the number of one-bits in a byte array */ extern uint64 pg_popcount(const char *buf, int bytes); +/* Count the number of 1-bits in the result of xor operation */ +extern uint64 pg_xorcount_long(const char *a, const char *b, int bytes); +static inline uint64 pg_xorcount(const char *a, const char *b, int bytes) +{ + /* For smaller lengths, do simple byte-by-byte traversal */ + if (bytes <= 32) + { + uint64 popcnt = 0; + + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) (*a++ ^ *b++)]; + return popcnt; + } + else + return pg_xorcount_long(a, b, bytes); +} + /* * Rotate the bits of "word" to the right by n bits. */ diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 2252021854..41a44b17a3 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -319,3 +319,37 @@ pg_popcount(const char *buf, int bytes) return popcnt; } + +/* + * pg_xorcount + * Count the number of 1-bits in the result of xor operation. + */ +uint64 +pg_xorcount_long(const char *a, const char *b, int bytes) +{ + uint64 popcnt = 0; + +#if SIZEOF_VOID_P >= 8 + /* Process in 64-bit chunks if both are aligned. */ + if (PointerIsAligned(a, uint64) && PointerIsAligned(b, uint64)) + { + const uint64 *a_words = (const uint64 *) a; + const uint64 *b_words = (const uint64 *) b; + + while (bytes >= 8) + { + popcnt += pg_popcount64(*a_words++ ^ *b_words++); + bytes -= 8; + } + + a = (const char *) a_words; + b = (const char *) b_words; + } +#endif + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) (*a++ ^ *b++)]; + + return popcnt; +} -- 2.17.1
From b8905b47f7e0af8d4558fdac73cc2283c263cbcc Mon Sep 17 00:00:00 2001 From: Amit Khandekar <amitdkhan...@gmail.com> Date: Thu, 10 Dec 2020 21:45:49 +0800 Subject: [PATCH 2/2] Avoid function pointer dereferencing for pg_popcount32/64() call. With USE_POPCNT_ASM set (which is true only on x86), we decide with the help of __get_cpuid() at runtime whether the CPU supports popcnt instruction, and hence we dynamically choose the corresponding function; thus pg_popcount32/64 is a function pointer. But for platforms with USE_POPCNT_ASM not set, we don't have to use dynamic assignment of functions, but we were still declaring pg_popcount64 as a function pointer. So arrange for direct function call so as to get rid of function pointer dereferencing each time pg_popcount32/64 is called. To do this, define pg_popcount64 to another function name (pg_popcount64_nonasm) rather than a function pointer, whenever USE_POPCNT_ASM is not defined. And let pg_popcount64_nonasm() be a static inline function so that whenever pg_popcount64() is called, the __builtin_popcount() gets called directly. For platforms not supporting __builtin_popcount(), continue using the slow version as is the current behaviour. pg_popcount64_slow() was actually a misnomer, because it was actually calling __builtin_popcount() which is not a slow function. Now with this commit, pg_popcount64_slow() indeed has only slow code. Tested this on ARM64, and the gist index creation for tsvectors speeds up by ~ 6% for default siglen (=124), and by 12% with siglen=700 --- src/include/port/pg_bitutils.h | 59 ++++++++++++++++++++++++++++++++++ src/port/pg_bitutils.c | 47 +++++---------------------- 2 files changed, 67 insertions(+), 39 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 174df28e66..b039f94ee5 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -23,6 +23,19 @@ extern const uint8 pg_rightmost_one_pos[256]; extern const uint8 pg_number_of_ones[256]; #endif +/* + * On x86_64, we can use the hardware popcount instruction, but only if + * we can verify that the CPU supports it via the cpuid instruction. + * + * Otherwise, we fall back to __builtin_popcount if the compiler has that, + * or a hand-rolled implementation if not. + */ +#ifdef HAVE_X86_64_POPCNTQ +#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID) +#define USE_POPCNT_ASM 1 +#endif +#endif + /* * pg_leftmost_one_pos32 * Returns the position of the most significant set bit in "word", @@ -208,8 +221,54 @@ pg_ceil_log2_64(uint64 num) } /* Count the number of one-bits in a uint32 or uint64 */ + +/* + * With USE_POPCNT_ASM set (which is true only on x86), we decide at runtime + * whether the CPU supports popcnt instruction, and hence we dynamically choose + * the corresponding function; thus pg_popcount32/64 is a function pointer. But + * for platforms with USE_POPCNT_ASM not set, we don't have to use dynamic + * assignment of functions, so we arrange for direct function call so as to get + * rid of function pointer dereferencing each time pg_popcount32/64 is called. + */ +#ifdef USE_POPCNT_ASM extern int (*pg_popcount32) (uint32 word); extern int (*pg_popcount64) (uint64 word); +#else +#define pg_popcount32 pg_popcount32_nonasm +#define pg_popcount64 pg_popcount64_nonasm +#endif + +/* Slow versions of pg_popcount */ +#ifndef HAVE__BUILTIN_POPCOUNT +extern int pg_popcount32_slow(uint32 word); +extern int pg_popcount64_slow(uint64 word); +#endif + +static inline int +pg_popcount64_nonasm(uint64 word) +{ +#ifdef HAVE__BUILTIN_POPCOUNT +#if defined(HAVE_LONG_INT_64) + return __builtin_popcountl(word); +#elif defined(HAVE_LONG_LONG_INT_64) + return __builtin_popcountll(word); +#else +#error must have a working 64-bit integer datatype +#endif +#else /* !HAVE__BUILTIN_POPCOUNT */ + return pg_popcount64_slow(word); +#endif /* HAVE__BUILTIN_POPCOUNT */ +} + +static inline int +pg_popcount32_nonasm(uint32 word) +{ +#ifdef HAVE__BUILTIN_POPCOUNT + return __builtin_popcount(word); +#else + return pg_popcount32_slow(word); +#endif /* HAVE__BUILTIN_POPCOUNT */ +} /* Count the number of one-bits in a byte array */ extern uint64 pg_popcount(const char *buf, int bytes); diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index cb2f5f5f0b..4a220590bf 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -103,22 +103,6 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -/* - * On x86_64, we can use the hardware popcount instruction, but only if - * we can verify that the CPU supports it via the cpuid instruction. - * - * Otherwise, we fall back to __builtin_popcount if the compiler has that, - * or a hand-rolled implementation if not. - */ -#ifdef HAVE_X86_64_POPCNTQ -#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID) -#define USE_POPCNT_ASM 1 -#endif -#endif - -static int pg_popcount32_slow(uint32 word); -static int pg_popcount64_slow(uint64 word); - #ifdef USE_POPCNT_ASM static bool pg_popcount_available(void); static int pg_popcount32_choose(uint32 word); @@ -128,9 +112,6 @@ static int pg_popcount64_asm(uint64 word); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; -#else -int (*pg_popcount32) (uint32 word) = pg_popcount32_slow; -int (*pg_popcount64) (uint64 word) = pg_popcount64_slow; #endif /* USE_POPCNT_ASM */ #ifdef USE_POPCNT_ASM @@ -170,8 +151,8 @@ pg_popcount32_choose(uint32 word) } else { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; + pg_popcount32 = pg_popcount32_nonasm; + pg_popcount64 = pg_popcount64_nonasm; } return pg_popcount32(word); @@ -187,8 +168,8 @@ pg_popcount64_choose(uint64 word) } else { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; + pg_popcount32 = pg_popcount32_nonasm; + pg_popcount64 = pg_popcount64_nonasm; } return pg_popcount64(word); @@ -223,16 +204,14 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); #endif /* USE_POPCNT_ASM */ +#ifndef HAVE__BUILTIN_POPCOUNT /* * pg_popcount32_slow * Return the number of 1 bits set in word */ -static int +int pg_popcount32_slow(uint32 word) { -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ int result = 0; while (word != 0) @@ -242,25 +221,15 @@ pg_popcount32_slow(uint32 word) } return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ } /* * pg_popcount64_slow * Return the number of 1 bits set in word */ -static int +int pg_popcount64_slow(uint64 word) { -#ifdef HAVE__BUILTIN_POPCOUNT -#if defined(HAVE_LONG_INT_64) - return __builtin_popcountl(word); -#elif defined(HAVE_LONG_LONG_INT_64) - return __builtin_popcountll(word); -#else -#error must have a working 64-bit integer datatype -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ int result = 0; while (word != 0) @@ -270,8 +239,8 @@ pg_popcount64_slow(uint64 word) } return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ } +#endif /* HAVE__BUILTIN_POPCOUNT */ /* -- 2.17.1