diff --git a/configure b/configure
index ec75318..2e79190 100755
--- a/configure
+++ b/configure
@@ -11937,6 +11937,39 @@ $as_echo "#define LOCALE_T_IN_XLOCALE 1" >>confdefs.h
 
 fi
 
+# Check for __builtin_popcount
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5
+$as_echo_n "checking for __builtin_popcount... " >&6; }
+if ${pgac_cv_have_builtin_popcount+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+return __builtin_popcount(42)
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_have_builtin_popcount="yes"
+else
+  pgac_cv_have_builtin_popcount="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_have_builtin_popcount" >&5
+$as_echo "$pgac_cv_have_builtin_popcount" >&6; }
+if test "$pgac_cv_have_builtin_popcount" = "yes" ; then
+
+$as_echo "#define HAVE_BUILTIN_POPCOUNT 1" >>confdefs.h
+
+fi
+
 ac_fn_c_check_type "$LINENO" "struct cmsgcred" "ac_cv_type_struct_cmsgcred" "#include <sys/socket.h>
 #include <sys/param.h>
 #ifdef HAVE_SYS_UCRED_H
diff --git a/configure.in b/configure.in
index bc925d0..4d50f6c 100644
--- a/configure.in
+++ b/configure.in
@@ -1371,6 +1371,15 @@ AC_TYPE_LONG_LONG_INT
 
 PGAC_TYPE_LOCALE_T
 
+# Check for __builtin_popcount
+AC_CACHE_CHECK([for __builtin_popcount], [pgac_cv_have_builtin_popcount],
+ [AC_LINK_IFELSE([AC_LANG_PROGRAM([], [[return __builtin_popcount(42)]])],
+				 [pgac_cv_have_builtin_popcount="yes"],
+				 [pgac_cv_have_builtin_popcount="no"])])
+if test "$pgac_cv_have_builtin_popcount" = "yes" ; then
+	AC_DEFINE(HAVE_BUILTIN_POPCOUNT, 1, [Define to 1 if you have __builtin_popcount.])
+fi
+
 AC_CHECK_TYPES([struct cmsgcred], [], [],
 [#include <sys/socket.h>
 #include <sys/param.h>
diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c
index 6dae7c91..7d6127d 100644
--- a/contrib/intarray/_intbig_gist.c
+++ b/contrib/intarray/_intbig_gist.c
@@ -5,6 +5,7 @@
 
 #include "access/gist.h"
 #include "access/stratnum.h"
+#include "common/bitutils.h"
 
 #include "_int.h"
 
@@ -19,27 +20,6 @@ PG_FUNCTION_INFO_V1(g_intbig_penalty);
 PG_FUNCTION_INFO_V1(g_intbig_picksplit);
 PG_FUNCTION_INFO_V1(g_intbig_union);
 PG_FUNCTION_INFO_V1(g_intbig_same);
-
-/* Number of one-bits in an unsigned byte */
-static const uint8 number_of_ones[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
 PG_FUNCTION_INFO_V1(_intbig_in);
 PG_FUNCTION_INFO_V1(_intbig_out);
 
@@ -207,12 +187,7 @@ g_intbig_compress(PG_FUNCTION_ARGS)
 static int32
 sizebitvec(BITVECP sign)
 {
-	int32		size = 0,
-				i;
-
-	LOOPBYTE
-		size += number_of_ones[(unsigned char) sign[i]];
-	return size;
+	return popcount(&sign[0], SIGLEN * sizeof(sign[0]));
 }
 
 static int
@@ -225,7 +200,7 @@ hemdistsign(BITVECP a, BITVECP b)
 	LOOPBYTE
 	{
 		diff = (unsigned char) (a[i] ^ b[i]);
-		dist += number_of_ones[diff];
+		dist += popcount_uint8(diff);
 	}
 	return dist;
 }
diff --git a/contrib/ltree/_ltree_gist.c b/contrib/ltree/_ltree_gist.c
index a387f5b..be433d0 100644
--- a/contrib/ltree/_ltree_gist.c
+++ b/contrib/ltree/_ltree_gist.c
@@ -9,6 +9,7 @@
 
 #include "access/gist.h"
 #include "access/stratnum.h"
+#include "common/bitutils.h"
 #include "crc32.h"
 #include "ltree.h"
 
@@ -22,27 +23,6 @@ PG_FUNCTION_INFO_V1(_ltree_consistent);
 
 #define GETENTRY(vec,pos) ((ltree_gist *) DatumGetPointer((vec)->vector[(pos)].key))
 #define NEXTVAL(x) ( (ltree*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
-
-/* Number of one-bits in an unsigned byte */
-static const uint8 number_of_ones[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
 #define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) )
 
 
@@ -209,12 +189,7 @@ _ltree_union(PG_FUNCTION_ARGS)
 static int32
 sizebitvec(BITVECP sign)
 {
-	int32		size = 0,
-				i;
-
-	ALOOPBYTE
-		size += number_of_ones[(unsigned char) sign[i]];
-	return size;
+	return popcount(&sign[0], SIGLEN * sizeof(sign[0]));
 }
 
 static int
@@ -227,7 +202,7 @@ hemdistsign(BITVECP a, BITVECP b)
 	ALOOPBYTE
 	{
 		diff = (unsigned char) (a[i] ^ b[i]);
-		dist += number_of_ones[diff];
+		dist += popcount_uint8(diff);
 	}
 	return dist;
 }
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index 2181c26..fba0f15 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -6,6 +6,7 @@
 #include "trgm.h"
 
 #include "access/stratnum.h"
+#include "common/bitutils.h"
 #include "fmgr.h"
 
 
@@ -39,27 +40,6 @@ PG_FUNCTION_INFO_V1(gtrgm_same);
 PG_FUNCTION_INFO_V1(gtrgm_penalty);
 PG_FUNCTION_INFO_V1(gtrgm_picksplit);
 
-/* Number of one-bits in an unsigned byte */
-static const uint8 number_of_ones[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
-
 Datum
 gtrgm_in(PG_FUNCTION_ARGS)
 {
@@ -629,12 +609,7 @@ gtrgm_same(PG_FUNCTION_ARGS)
 static int32
 sizebitvec(BITVECP sign)
 {
-	int32		size = 0,
-				i;
-
-	LOOPBYTE
-		size += number_of_ones[(unsigned char) sign[i]];
-	return size;
+	return popcount(&sign[0], SIGLEN * sizeof(sign[0]));
 }
 
 static int
@@ -647,7 +622,7 @@ hemdistsign(BITVECP a, BITVECP b)
 	LOOPBYTE
 	{
 		diff = (unsigned char) (a[i] ^ b[i]);
-		dist += number_of_ones[diff];
+		dist += popcount_uint8(diff);
 	}
 	return dist;
 }
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index eaab4be..50b98c9 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -88,6 +88,7 @@
 #include "access/heapam_xlog.h"
 #include "access/visibilitymap.h"
 #include "access/xlog.h"
+#include "common/bitutils.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
 #include "storage/lmgr.h"
@@ -112,43 +113,9 @@
 #define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
 #define HEAPBLK_TO_MAPBIT(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
-/* tables for fast counting of set bits for visible and frozen */
-static const uint8 number_of_ones_for_visible[256] = {
-	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
-	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	0, 1, 0, 1, 1, 2, 1, 2, 0, 1, 0, 1, 1, 2, 1, 2,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4,
-	1, 2, 1, 2, 2, 3, 2, 3, 1, 2, 1, 2, 2, 3, 2, 3,
-	2, 3, 2, 3, 3, 4, 3, 4, 2, 3, 2, 3, 3, 4, 3, 4
-};
-static const uint8 number_of_ones_for_frozen[256] = {
-	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
-	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
-	0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
-	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3,
-	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4,
-	2, 2, 3, 3, 2, 2, 3, 3, 3, 3, 4, 4, 3, 3, 4, 4
-};
+/* Masks for bit counting. */
+#define VISIBLE_MASK64 0x5555555555555555 /* The lower bit of each bit pair */
+#define FROZEN_MASK64 0xaaaaaaaaaaaaaaaa /* The upper bit of each bit pair */
 
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -409,7 +376,7 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 	for (mapBlock = 0;; mapBlock++)
 	{
 		Buffer		mapBuffer;
-		unsigned char *map;
+		uint64	   *map;
 		int			i;
 
 		/*
@@ -426,13 +393,15 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 		 * immediately stale anyway if anyone is concurrently setting or
 		 * clearing bits, and we only really need an approximate value.
 		 */
-		map = (unsigned char *) PageGetContents(BufferGetPage(mapBuffer));
+		map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 
-		for (i = 0; i < MAPSIZE; i++)
+		StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
+						 "unsupported MAPSIZE");
+		for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
 		{
-			*all_visible += number_of_ones_for_visible[map[i]];
+			*all_visible += popcount_uint64(map[i] & VISIBLE_MASK64);
 			if (all_frozen)
-				*all_frozen += number_of_ones_for_frozen[map[i]];
+				*all_frozen += popcount_uint64(map[i] & FROZEN_MASK64);
 		}
 
 		ReleaseBuffer(mapBuffer);
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index f281cc5..cd9a82a 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -21,7 +21,9 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "common/bitutils.h"
 
+#include <string.h>
 
 #define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
 #define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)
@@ -50,58 +52,18 @@
 
 #define HAS_MULTIPLE_ONES(x)	((bitmapword) RIGHTMOST_ONE(x) != (x))
 
-
 /*
- * Lookup tables to avoid need for bit-by-bit groveling
- *
- * rightmost_one_pos[x] gives the bit number (0-7) of the rightmost one bit
- * in a nonzero byte value x.  The entry for x=0 is never used.
- *
- * number_of_ones[x] gives the number of one-bits (0-8) in a byte value x.
- *
- * We could make these tables larger and reduce the number of iterations
- * in the functions that use them, but bytewise shifts and masks are
- * especially fast on many machines, so working a byte at a time seems best.
+ * Find the rightmost (least significant) bit set in a bitmapword.  The least
+ * significant bit is called 0 (unlike the ffs function which call that bit
+ * position 1).  Undefined if there are no bits set.
  */
-
-static const uint8 rightmost_one_pos[256] = {
-	0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
-	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
-};
-
-static const uint8 number_of_ones[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
+static inline int
+rightmost_one_pos(bitmapword word)
+{
+	StaticAssertStmt(sizeof(bitmapword) <= sizeof(int),
+					 "unsupported bitmapword size");
+	return ffs((int) word) - 1;
+}
 
 /*
  * bms_copy - make a palloc'd copy of a bitmapset
@@ -511,12 +473,7 @@ bms_singleton_member(const Bitmapset *a)
 			if (result >= 0 || HAS_MULTIPLE_ONES(w))
 				elog(ERROR, "bitmapset has multiple members");
 			result = wordnum * BITS_PER_BITMAPWORD;
-			while ((w & 255) == 0)
-			{
-				w >>= 8;
-				result += 8;
-			}
-			result += rightmost_one_pos[w & 255];
+			result += rightmost_one_pos(w);
 		}
 	}
 	if (result < 0)
@@ -554,12 +511,7 @@ bms_get_singleton_member(const Bitmapset *a, int *member)
 			if (result >= 0 || HAS_MULTIPLE_ONES(w))
 				return false;
 			result = wordnum * BITS_PER_BITMAPWORD;
-			while ((w & 255) == 0)
-			{
-				w >>= 8;
-				result += 8;
-			}
-			result += rightmost_one_pos[w & 255];
+			result += rightmost_one_pos(w);
 		}
 	}
 	if (result < 0)
@@ -574,25 +526,10 @@ bms_get_singleton_member(const Bitmapset *a, int *member)
 int
 bms_num_members(const Bitmapset *a)
 {
-	int			result = 0;
-	int			nwords;
-	int			wordnum;
-
 	if (a == NULL)
 		return 0;
-	nwords = a->nwords;
-	for (wordnum = 0; wordnum < nwords; wordnum++)
-	{
-		bitmapword	w = a->words[wordnum];
-
-		/* we assume here that bitmapword is an unsigned type */
-		while (w != 0)
-		{
-			result += number_of_ones[w & 255];
-			w >>= 8;
-		}
-	}
-	return result;
+	return popcount(&a->words[0],
+					a->nwords * sizeof(a->words[0]));
 }
 
 /*
@@ -872,12 +809,7 @@ bms_first_member(Bitmapset *a)
 			a->words[wordnum] &= ~w;
 
 			result = wordnum * BITS_PER_BITMAPWORD;
-			while ((w & 255) == 0)
-			{
-				w >>= 8;
-				result += 8;
-			}
-			result += rightmost_one_pos[w & 255];
+			result += rightmost_one_pos(w);
 			return result;
 		}
 	}
@@ -927,12 +859,7 @@ bms_next_member(const Bitmapset *a, int prevbit)
 			int			result;
 
 			result = wordnum * BITS_PER_BITMAPWORD;
-			while ((w & 255) == 0)
-			{
-				w >>= 8;
-				result += 8;
-			}
-			result += rightmost_one_pos[w & 255];
+			result += rightmost_one_pos(w);
 			return result;
 		}
 
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index cdd5d43..79f8466 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -16,6 +16,7 @@
 
 #include "access/gist.h"
 #include "access/tuptoaster.h"
+#include "common/bitutils.h"
 #include "tsearch/ts_utils.h"
 #include "utils/pg_crc.h"
 
@@ -69,26 +70,6 @@ typedef struct
 #define GETARR(x)	( (int32*)( (char*)(x)+GTHDRSIZE ) )
 #define ARRNELEM(x) ( ( VARSIZE(x) - GTHDRSIZE )/sizeof(int32) )
 
-/* Number of one-bits in an unsigned byte */
-static const uint8 number_of_ones[256] = {
-	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
-};
-
 static int32 sizebitvec(BITVECP sign);
 
 Datum
@@ -499,12 +480,7 @@ gtsvector_same(PG_FUNCTION_ARGS)
 static int32
 sizebitvec(BITVECP sign)
 {
-	int32		size = 0,
-				i;
-
-	LOOPBYTE
-		size += number_of_ones[(unsigned char) sign[i]];
-	return size;
+	return popcount(&sign[0], SIGLEN * sizeof(sign[0]));
 }
 
 static int
@@ -517,7 +493,7 @@ hemdistsign(BITVECP a, BITVECP b)
 	LOOPBYTE
 	{
 		diff = (unsigned char) (a[i] ^ b[i]);
-		dist += number_of_ones[diff];
+		dist += popcount_uint8(diff);
 	}
 	return dist;
 }
diff --git a/src/common/Makefile b/src/common/Makefile
index 72b7369..42ec487 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -36,7 +36,7 @@ override CPPFLAGS += -DVAL_LDFLAGS_EX="\"$(LDFLAGS_EX)\""
 override CPPFLAGS += -DVAL_LDFLAGS_SL="\"$(LDFLAGS_SL)\""
 override CPPFLAGS += -DVAL_LIBS="\"$(LIBS)\""
 
-OBJS_COMMON = config_info.o controldata_utils.o exec.o keywords.o \
+OBJS_COMMON = bitutils.o config_info.o controldata_utils.o exec.o keywords.o \
 	pg_lzcompress.o pgfnames.o psprintf.o relpath.o rmtree.o \
 	string.o username.o wait_error.o
 
diff --git a/src/common/bitutils.c b/src/common/bitutils.c
new file mode 100644
index 0000000..9da062f
--- /dev/null
+++ b/src/common/bitutils.c
@@ -0,0 +1,105 @@
+/*
+ *	bitutils.c
+ *		bit manipulation support
+ *
+ *	Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ *	src/common/bitutils.c
+ */
+
+#include "postgres.h"
+
+#include "common/bitutils.h"
+
+/* The lookup table used for 8 bit popcounts. */
+const uint8 popcount_uint8_table[256] = {
+	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
+
+#if !defined(USE_POPCOUNT_TABLE) && !defined(HAVE_BUILTIN_POPCOUNT)
+
+/*
+ * Wider versions, as functions rather than macros to avoid multiple
+ * evaluation hazards.
+ */
+
+int
+popcount_uint32(uint32 v)
+{
+	return
+		popcount_uint8(v & 0xff) +
+		popcount_uint8((v >> 8) & 0xff) +
+		popcount_uint8((v >> 16) & 0xff) +
+		popcount_uint8(v >> 24);
+}
+
+int
+popcount_uint64(uint64 v)
+{
+	/*
+	 * Maybe some of the other codings from Hacker's Delight, Knuth or
+	 * Beautiful Code would be better?
+	 */
+	return
+		popcount_uint8(v & 0xff) +
+		popcount_uint8((v >> 8) & 0xff) +
+		popcount_uint8((v >> 16) & 0xff) +
+		popcount_uint8((v >> 24) & 0xff) +
+		popcount_uint8((v >> 32) & 0xff) +
+		popcount_uint8((v >> 40) & 0xff) +
+		popcount_uint8((v >> 48) & 0xff) +
+		popcount_uint8(v >> 56);
+}
+
+#endif
+
+/*
+ * Count the number of bits set in a region of memory.  Do so using the widest
+ * size possible, so that builds that use hardware instructions rather than
+ * looking up each byte can get the maximum benefit.  'data' must be correctly
+ * aligned for a uint64.
+ */
+Size
+popcount(const void *data, Size size)
+{
+	Size result = 0;
+	const uint8 *end = ((uint8 *) data) + size;
+	const uint64 *p64;
+	const uint32 *p32;
+	const uint8 *p8;
+
+	/*
+	 * TODO: Not tested, but 32 bit grain might actually be better than 64:
+	 * http://stackoverflow.com/questions/25078285/replacing-a-32-bit-loop-count-variable-with-64-bit-introduces-crazy-performance/
+	 */
+
+	/* Process as many whole uint64s as we can */
+	p64 = (uint64 *) data;
+	while ((uint8 *) (p64 + 1) <= end)
+		result += popcount_uint64(*p64++);
+	/* See if there is a trailing whole uint32 we can process */
+	p32 = (uint32 *) p64;
+	if ((uint8 *) (p32 + 1) <= end)
+		result += popcount_uint32(*p32++);
+	/* Mop up any remaining bytes */
+	p8 = (uint8 *) p32;
+	while (p8 < end)
+		result += popcount_uint8(*p8++);
+
+	return result;
+}
diff --git a/src/include/common/bitutils.h b/src/include/common/bitutils.h
new file mode 100644
index 0000000..492fbc9
--- /dev/null
+++ b/src/include/common/bitutils.h
@@ -0,0 +1,34 @@
+/*
+ *	bitutils.h
+ * 		bit manipulation support
+ *
+ *	Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ *	src/include/common/bitutils.h
+ */
+#ifndef BITUTILS_H
+#define BITUTILS_H
+
+/* Use a private lookup table for the per-byte population count. */
+extern const uint8 popcount_uint8_table[256];
+#define popcount_uint8(v) popcount_uint8_table[(v)]
+
+#if !defined(POPCOUNT_USE_TABLE) && defined(HAVE_BUILTIN_POPCOUNT)
+
+/* Use GCC built-ins for wider sizes */
+#define popcount_uint32(v)						\
+	__builtin_popcount(v)
+#define popcount_uint64(v)						\
+	__builtin_popcountll(v)
+
+#else
+
+/* Use functions for the wider sizes to avoid multiple evaluation hazards. */
+int popcount_uint32(uint32 v);
+int popcount_uint64(uint64 v);
+
+#endif
+
+extern Size popcount(const void *data, Size size);
+
+#endif   /* BITUTILS_H */
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index b621ff2..f66231a 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -90,6 +90,9 @@
 /* Define to 1 if you have the <atomic.h> header file. */
 #undef HAVE_ATOMIC_H
 
+/* Define to 1 if you have __builtin_popcount. */
+#undef HAVE_BUILTIN_POPCOUNT
+
 /* Define to 1 if you have the `cbrt' function. */
 #undef HAVE_CBRT
 
