From 077f88bcbff74d29b64459fcdac3096a28d07b72 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Thu, 26 Dec 2019 18:28:50 -0500
Subject: [PATCH] Use the CLZ instruction in AllocSetFreeIndex()

In commit ab5b4e2f9ed, we optimized AllocSetFreeIndex() using a lookup
table. At the time, using CLZ was rejected because compiler/platform
support was not widespread enough to justify it. Since 02a6a54ecd6,
we test for availability of __builtin_clz(), so use that instead. This
is about 20% faster on Intel platforms, but perhaps more importantly
reduces cache pollution caused by the lookup table approach.

In addition, for the open-coded case, use the general-purpose lookup
table added by 02a6a54ecd6, rather than a single-purpose one. This
allows platforms without CLZ to reduce cache pollution as well.
---
 src/backend/utils/mmgr/aset.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index f729d9b6de..137c0b8ee5 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -46,6 +46,7 @@
 
 #include "postgres.h"
 
+#include "port/pg_bitutils.h"
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
@@ -297,18 +298,6 @@ static const MemoryContextMethods AllocSetMethods = {
 #endif
 };
 
-/*
- * Table for AllocSetFreeIndex
- */
-#define LT16(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
-
-static const unsigned char LogTable256[256] =
-{
-	0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-	LT16(5), LT16(6), LT16(6), LT16(7), LT16(7), LT16(7), LT16(7),
-	LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8)
-};
-
 /* ----------
  * Debug macros
  * ----------
@@ -337,8 +326,7 @@ static inline int
 AllocSetFreeIndex(Size size)
 {
 	int			idx;
-	unsigned int t,
-				tsize;
+	unsigned int tsize;
 
 	if (size > (1 << ALLOC_MINBITS))
 	{
@@ -346,15 +334,20 @@ AllocSetFreeIndex(Size size)
 
 		/*
 		 * At this point we need to obtain log2(tsize)+1, ie, the number of
-		 * not-all-zero bits at the right.  We used to do this with a
-		 * shift-and-count loop, but this function is enough of a hotspot to
-		 * justify micro-optimization effort.  The best approach seems to be
-		 * to use a lookup table.  Note that this code assumes that
-		 * ALLOCSET_NUM_FREELISTS <= 17, since we only cope with two bytes of
-		 * the tsize value.
+		 * not-all-zero bits at the right.  We don't use the utility function
+		 * pg_leftmost_one_pos32() here because if CLZ is not available,
+		 * determining the correct shift has a performance penalty.
+		 * By assuming that ALLOCSET_NUM_FREELISTS <= 17, we only need to
+		 * cope with two bytes of the tsize value.
 		 */
+#ifdef HAVE__BUILTIN_CLZ
+		idx = 32 - __builtin_clz((uint32) tsize);
+#else
+		unsigned int t;
 		t = tsize >> 8;
-		idx = t ? LogTable256[t] + 8 : LogTable256[tsize];
+		idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize];
+		idx += 1;
+#endif
 
 		Assert(idx < ALLOCSET_NUM_FREELISTS);
 	}
-- 
2.22.0

