This developed a slight merge conflict. I've rebased the attached
version, and I also took the step of getting rid of buf_table.c, as I
think I proposed somewhere upthread. This avoids the overhead of
constructing a BufferTag only to copy it into a BufferLookupEnt, plus
some function calls and so forth. A quick-and-dirty test suggests
this might not have cut down on the 1-client overhead much, but I
think it's worth doing anyway: it's certainly saving a few cycles, and
I don't think it's complicating anything measurably.
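
For reference, the shape of that change (paraphrasing the PrefetchBuffer
hunk in the attached patch, not anything new): the lookup entry's key
field is itself a BufferTag, so callers build the tag in place and hand
the entry straight to the concurrent hash table, instead of building a
BufferTag, hashing it, taking the partition lock, and copying it into a
BufferLookupEnt:

    BufferLookupEnt ent;    /* identity of requested block */

    /* build the tag directly inside the lookup entry */
    INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node,
                   forkNum, blockNum);

    /* on a hit, CHashSearch fills in ent.id with the buffer ID */
    if (!CHashSearch(SharedBufHash, &ent))
        smgrprefetch(reln->rd_smgr, forkNum, blockNum);
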
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/contrib/Makefile b/contrib/Makefile
index 195d447..141ef0a 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -19,6 +19,7 @@ SUBDIRS = \
earthdistance \
file_fdw \
fuzzystrmatch \
+ hashtest \
hstore \
intagg \
intarray \
diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile
new file mode 100644
index 0000000..3ee42f8
--- /dev/null
+++ b/contrib/hashtest/Makefile
@@ -0,0 +1,18 @@
+# contrib/hashtest/Makefile
+
+MODULE_big = hashtest
+OBJS = hashtest.o
+
+EXTENSION = hashtest
+DATA = hashtest--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/hashtest
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql
new file mode 100644
index 0000000..e271baf
--- /dev/null
+++ b/contrib/hashtest/hashtest--1.0.sql
@@ -0,0 +1,52 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION hashtest" to load this file. \quit
+
+CREATE FUNCTION chash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_collision_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_collision_test'
+LANGUAGE C;
diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c
new file mode 100644
index 0000000..172a5bb
--- /dev/null
+++ b/contrib/hashtest/hashtest.c
@@ -0,0 +1,527 @@
+/*-------------------------------------------------------------------------
+ * hashtest.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "libpq/auth.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/ipc.h"
+#include "utils/chash.h"
+
+PG_MODULE_MAGIC;
+
+void _PG_init(void);
+Datum chash_insert_test(PG_FUNCTION_ARGS);
+Datum chash_search_test(PG_FUNCTION_ARGS);
+Datum chash_delete_test(PG_FUNCTION_ARGS);
+Datum chash_concurrent_test(PG_FUNCTION_ARGS);
+Datum chash_collision_test(PG_FUNCTION_ARGS);
+Datum dynahash_insert_test(PG_FUNCTION_ARGS);
+Datum dynahash_search_test(PG_FUNCTION_ARGS);
+Datum dynahash_delete_test(PG_FUNCTION_ARGS);
+Datum dynahash_concurrent_test(PG_FUNCTION_ARGS);
+Datum dynahash_collision_test(PG_FUNCTION_ARGS);
+static void hashtest_shmem_startup(void);
+
+PG_FUNCTION_INFO_V1(chash_insert_test);
+PG_FUNCTION_INFO_V1(chash_search_test);
+PG_FUNCTION_INFO_V1(chash_delete_test);
+PG_FUNCTION_INFO_V1(chash_concurrent_test);
+PG_FUNCTION_INFO_V1(chash_collision_test);
+PG_FUNCTION_INFO_V1(dynahash_insert_test);
+PG_FUNCTION_INFO_V1(dynahash_search_test);
+PG_FUNCTION_INFO_V1(dynahash_delete_test);
+PG_FUNCTION_INFO_V1(dynahash_concurrent_test);
+PG_FUNCTION_INFO_V1(dynahash_collision_test);
+
+typedef struct
+{
+ uint32 key;
+ uint32 val;
+} hentry;
+
+static CHashDescriptor cdesc = {
+ "hashtest-chash", /* name */
+ 1048576, /* capacity */
+ sizeof(hentry), /* element size */
+ sizeof(uint32) /* key size */
+};
+
+#define DYNAHASH_PARTITIONS 16
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static CHashTable chash;
+static HTAB *dynahash;
+static LWLockId dynahash_lock[DYNAHASH_PARTITIONS];
+static ClientAuthentication_hook_type original_client_auth_hook = NULL;
+
+static void hashtest_client_auth_hook(Port *port, int status);
+static void chash_write_stats_to_log(int code, Datum dummy);
+
+#define dynahash_get_lock(hashcode) \
+ (dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS])
+
+void
+_PG_init(void)
+{
+ Size cs;
+ Size ds;
+
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = hashtest_shmem_startup;
+ chash = CHashBootstrap(&cdesc);
+ cs = CHashEstimateSize(chash);
+ RequestAddinShmemSpace(cs);
+ ds = hash_estimate_size(cdesc.capacity, cdesc.element_size);
+ RequestAddinShmemSpace(ds);
+ elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs,
+ (unsigned) ds);
+ RequestAddinLWLocks(DYNAHASH_PARTITIONS);
+ original_client_auth_hook = ClientAuthentication_hook;
+ ClientAuthentication_hook = hashtest_client_auth_hook;
+
+}
+
+static void
+hashtest_client_auth_hook(Port *port, int status)
+{
+ if (original_client_auth_hook)
+ original_client_auth_hook(port, status);
+ on_proc_exit(chash_write_stats_to_log, (Datum) 0);
+}
+
+static void
+chash_write_stats_to_log(int code, Datum dummy)
+{
+ uint64 stats[CHS_NumberOfStatistics];
+ CHashStatisticsType i;
+ StringInfoData buf;
+
+ CHashStatistics(chash, stats);
+ initStringInfo(&buf);
+
+ for (i = 0; i < CHS_NumberOfStatistics; ++i)
+ {
+ if (stats[i] == 0)
+ continue;
+ appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i],
+ CHashStatisticsNames[i]);
+ }
+
+ if (buf.len > 1)
+ {
+ buf.data[buf.len-2] = '\0';
+ elog(LOG, "chash statistics: %s", buf.data);
+ }
+}
+
+static void
+hashtest_shmem_startup(void)
+{
+ HASHCTL info;
+ uint32 i;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /* Initialize concurrent hash table. */
+ chash = CHashInitialize(chash, &cdesc);
+
+ /* Initialize shared dynahash table. */
+ info.keysize = cdesc.key_size;
+ info.entrysize = cdesc.element_size;
+ info.hash = tag_hash;
+ info.num_partitions = DYNAHASH_PARTITIONS;
+
+ dynahash = ShmemInitHash("hashtest-dynahash",
+ cdesc.capacity, cdesc.capacity,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ for (i = 0; i < DYNAHASH_PARTITIONS; ++i)
+ dynahash_lock[i] = LWLockAssign();
+}
+
+Datum
+chash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = i * 31;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != e.key * 31)
+ elog(LOG, "search %u: found %u", i, e.val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = CHashDelete(chash, &e);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = 0;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!CHashSearch(chash, &e))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!CHashDelete(chash, &e))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, e.val);
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+static bool
+dynahash_insert(uint32 key, uint32 val)
+{
+ bool found;
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_ENTER, &found);
+ if (!found)
+ e->val = val;
+ LWLockRelease(lockid);
+
+ return !found;
+}
+
+static bool
+dynahash_search(uint32 key, uint32 *val)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_SHARED);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_FIND, NULL);
+ if (e)
+ *val = e->val;
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+static bool
+dynahash_delete(uint32 key)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_REMOVE, NULL);
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+Datum
+dynahash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, i * 31);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = dynahash_insert(i, i * 31);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+ uint32 val;
+
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != i * 31)
+ elog(LOG, "search %u: found %u", i, val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = dynahash_delete(i);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(seed | i, MyProcPid);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_search(seed | i, &val);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!dynahash_search(seed | i, &val))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(seed | i);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!dynahash_delete(seed | i))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, MyProcPid);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control
new file mode 100644
index 0000000..b8e0f01
--- /dev/null
+++ b/contrib/hashtest/hashtest.control
@@ -0,0 +1,4 @@
+comment = 'hash testing code'
+default_version = '1.0'
+module_pathname = '$libdir/hashtest'
+relocatable = true
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
index 2c10fba..b30a0da 100644
--- a/src/backend/storage/buffer/Makefile
+++ b/src/backend/storage/buffer/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o
+OBJS = buf_init.o bufmgr.o freelist.o localbuf.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index a4ebbcc..86697e9 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -100,30 +100,10 @@ Buffer Manager's Internal Locking
Before PostgreSQL 8.1, all operations of the shared buffer manager itself
were protected by a single system-wide lock, the BufMgrLock, which
-unsurprisingly proved to be a source of contention. The new locking scheme
-avoids grabbing system-wide exclusive locks in common code paths. It works
-like this:
-
-* There is a system-wide LWLock, the BufMappingLock, that notionally
-protects the mapping from buffer tags (page identifiers) to buffers.
-(Physically, it can be thought of as protecting the hash table maintained
-by buf_table.c.) To look up whether a buffer exists for a tag, it is
-sufficient to obtain share lock on the BufMappingLock. Note that one
-must pin the found buffer, if any, before releasing the BufMappingLock.
-To alter the page assignment of any buffer, one must hold exclusive lock
-on the BufMappingLock. This lock must be held across adjusting the buffer's
-header fields and changing the buf_table hash table. The only common
-operation that needs exclusive lock is reading in a page that was not
-in shared buffers already, which will require at least a kernel call
-and usually a wait for I/O, so it will be slow anyway.
-
-* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
-separate locks, each guarding a portion of the buffer tag space. This allows
-further reduction of contention in the normal code paths. The partition
-that a particular buffer tag belongs to is determined from the low-order
-bits of the tag's hash value. The rules stated above apply to each partition
-independently. If it is necessary to lock more than one partition at a time,
-they must be locked in partition-number order to avoid risk of deadlock.
+unsurprisingly proved to be a source of contention. In subsequent releases,
+this lock was split into NUM_BUFFER_PARTITIONS locks, each guarding a portion
+of the buffer tag space. Even this allowed too much contention, so
+we now use a highly concurrent hash table (see chash.c and chash.h).
* A separate system-wide spinlock, buffer_strategy_lock, provides mutual
exclusion for operations that access the buffer free list or select
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
deleted file mode 100644
index 6ed47d5..0000000
--- a/src/backend/storage/buffer/buf_table.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * buf_table.c
- * routines for mapping BufferTags to buffer indexes.
- *
- * Note: the routines in this file do no locking of their own. The caller
- * must hold a suitable lock on the appropriate BufMappingLock, as specified
- * in the comments. We can't do the locking inside these functions because
- * in most cases the caller needs to adjust the buffer header contents
- * before the lock is released (see notes in README).
- *
- *
- * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * src/backend/storage/buffer/buf_table.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include "storage/bufmgr.h"
-#include "storage/buf_internals.h"
-
-
-/* entry for buffer lookup hashtable */
-typedef struct
-{
- BufferTag key; /* Tag of a disk page */
- int id; /* Associated buffer ID */
-} BufferLookupEnt;
-
-static HTAB *SharedBufHash;
-
-
-/*
- * Estimate space needed for mapping hashtable
- * size is the desired hash table size (possibly more than NBuffers)
- */
-Size
-BufTableShmemSize(int size)
-{
- return hash_estimate_size(size, sizeof(BufferLookupEnt));
-}
-
-/*
- * Initialize shmem hash table for mapping buffers
- * size is the desired hash table size (possibly more than NBuffers)
- */
-void
-InitBufTable(int size)
-{
- HASHCTL info;
-
- /* assume no locking is needed yet */
-
- /* BufferTag maps to Buffer */
- info.keysize = sizeof(BufferTag);
- info.entrysize = sizeof(BufferLookupEnt);
- info.num_partitions = NUM_BUFFER_PARTITIONS;
-
- SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
- size, size,
- &info,
- HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
-}
-
-/*
- * BufTableHashCode
- * Compute the hash code associated with a BufferTag
- *
- * This must be passed to the lookup/insert/delete routines along with the
- * tag. We do it like this because the callers need to know the hash code
- * in order to determine which buffer partition to lock, and we don't want
- * to do the hash computation twice (hash_any is a bit slow).
- */
-uint32
-BufTableHashCode(BufferTag *tagPtr)
-{
- return get_hash_value(SharedBufHash, (void *) tagPtr);
-}
-
-/*
- * BufTableLookup
- * Lookup the given BufferTag; return buffer ID, or -1 if not found
- *
- * Caller must hold at least share lock on BufMappingLock for tag's partition
- */
-int
-BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
-{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_FIND,
- NULL);
-
- if (!result)
- return -1;
-
- return result->id;
-}
-
-/*
- * BufTableInsert
- * Insert a hashtable entry for given tag and buffer ID,
- * unless an entry already exists for that tag
- *
- * Returns -1 on successful insertion. If a conflicting entry exists
- * already, returns the buffer ID in that entry.
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-int
-BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
-{
- BufferLookupEnt *result;
- bool found;
-
- Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
- Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_ENTER,
- &found);
-
- if (found) /* found something already in the table */
- return result->id;
-
- result->id = buf_id;
-
- return -1;
-}
-
-/*
- * BufTableDelete
- * Delete the hashtable entry for given tag (which must exist)
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-void
-BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
-{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_REMOVE,
- NULL);
-
- if (!result) /* shouldn't happen */
- elog(ERROR, "shared buffer hash table corrupted");
-}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 7eb2d22..4435b3e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -24,9 +24,7 @@
* MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
* The disk write is delayed until buffer replacement or checkpoint.
*
- * See also these files:
- * freelist.c -- chooses victim for buffer replacement
- * buf_table.c -- manages the buffer lookup table
+ * See also freelist.c, which chooses the victim for buffer replacement
*/
#include "postgres.h"
@@ -47,10 +45,25 @@
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
+#include "utils/chash.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
+/* entry for buffer lookup hashtable */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated buffer ID */
+} BufferLookupEnt;
+
+static CHashDescriptor SharedBufDescriptor = {
+ "buffer lookup table",
+ 0,
+ sizeof(BufferLookupEnt),
+ sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
@@ -138,6 +151,38 @@ static inline int32 GetPrivateRefCount(Buffer buffer);
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
/*
+ * Estimate space needed for mapping hashtable
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+Size
+BufMgrShmemSize(int size)
+{
+ if (SharedBufHash == NULL)
+ {
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+ }
+
+ return CHashEstimateSize(SharedBufHash);
+}
+
+/*
+ * Initialize shmem hash table for mapping buffers
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+void
+BufMgrInitShmem(int size)
+{
+ if (SharedBufHash == NULL || !IsUnderPostmaster)
+ {
+ Assert(SharedBufDescriptor.capacity == 0 ||
+ SharedBufDescriptor.capacity == size);
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+ }
+}
+
+/*
 * Ensure that the PrivateRefCountArray has sufficient space to store one
* more entry. This has to be called before using NewPrivateRefCountEntry() to
* fill a new entry - but it's perfectly fine to not use a reserved entry.
@@ -444,26 +489,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
}
else
{
- BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
- int buf_id;
+ BufferLookupEnt ent; /* identity of requested block */
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
+ INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
- /* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- LWLockRelease(newPartitionLock);
-
/* If not in buffers, initiate prefetch */
- if (buf_id < 0)
+ if (!CHashSearch(SharedBufHash, &ent))
smgrprefetch(reln->rd_smgr, forkNum, blockNum);
/*
@@ -870,43 +903,39 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BufferAccessStrategy strategy,
bool *foundPtr)
{
- BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
- BufferTag oldTag; /* previous identity of selected buffer */
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
+ BufferLookupEnt newEnt; /* identity of requested block */
+ BufferLookupEnt oldEnt; /* previous identity of selected buffer */
BufFlags oldFlags;
- int buf_id;
volatile BufferDesc *buf;
bool valid;
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
-
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
+ INIT_BUFFERTAG(newEnt.key, smgr->smgr_rnode.node, forkNum, blockNum);
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- if (buf_id >= 0)
+start:
+ if (CHashSearch(SharedBufHash, &newEnt))
{
+ BufferDesc *foundbuf;
+
/*
* Found it. Now, pin the buffer so no one can steal it from the
- * buffer pool, and check to see if the correct data has been loaded
- * into the buffer.
+ * buffer pool.
*/
- buf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[newEnt.id];
- valid = PinBuffer(buf, strategy);
+ valid = PinBuffer(foundbuf, strategy);
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
+ /* Check whether someone recycled the buffer before we pinned it. */
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto start;
+ }
*foundPtr = TRUE;
+ /* Check to see if the correct data has been loaded into the buffer. */
if (!valid)
{
/*
@@ -916,7 +945,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* own read attempt if the page is still not BM_VALID.
* StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer must
@@ -926,15 +955,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
- /*
- * Didn't find it in the buffer pool. We'll have to initialize a new
- * buffer. Remember to unlock the mapping lock while doing the work.
- */
- LWLockRelease(newPartitionLock);
-
/* Loop here in case we have to try another victim buffer */
for (;;)
{
@@ -1041,42 +1064,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
if (oldFlags & BM_TAG_VALID)
{
- /*
- * Need to compute the old tag's hashcode and partition lock ID.
- * XXX is it worth storing the hashcode in BufferDesc so we need
- * not recompute it here? Probably not.
- */
- oldTag = buf->tag;
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
- /*
- * Must lock the lower-numbered partition first to avoid
- * deadlocks.
- */
- if (oldPartitionLock < newPartitionLock)
- {
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- else if (oldPartitionLock > newPartitionLock)
- {
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- }
- else
- {
- /* only one partition, only one lock */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- }
- else
- {
- /* if it wasn't valid, we need only the new partition */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- /* these just keep the compiler quiet about uninit variables */
- oldHash = 0;
- oldPartitionLock = 0;
+ /* Save old tag. */
+ oldEnt.key = buf->tag;
}
/*
@@ -1086,32 +1075,33 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Note that we have not yet removed the hashtable entry for the old
* tag.
*/
- buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
-
- if (buf_id >= 0)
+enter:
+ newEnt.id = buf->buf_id;
+ if (!CHashInsert(SharedBufHash, &newEnt))
{
+ BufferDesc *foundbuf;
+
+ /*
+ * We've got a collision, apparently: it looks like someone else
+ * did what we were about to do. We can handle this as if we had
+ * found the buffer in the pool in the first place, but we must
+ * recheck the buffer tag after pinning it, because it could still
+ * get renamed under us.
+ */
+ foundbuf = &BufferDescriptors[newEnt.id];
+ valid = PinBuffer(foundbuf, strategy);
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto enter;
+ }
+
/*
- * Got a collision. Someone has already done what we were about to
- * do. We'll just handle this as if it were found in the buffer
- * pool in the first place. First, give up the buffer we were
- * planning to use.
+ * Collision confirmed. Give up the buffer we were planning to
+ * use.
*/
UnpinBuffer(buf, true);
- /* Can give up that buffer's mapping partition lock now */
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
-
- /* remaining code should match code at top of routine */
-
- buf = &BufferDescriptors[buf_id];
-
- valid = PinBuffer(buf, strategy);
-
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
-
*foundPtr = TRUE;
if (!valid)
@@ -1123,7 +1113,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer
@@ -1133,7 +1123,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
/*
@@ -1152,11 +1142,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
break;
UnlockBufHdr(buf);
- BufTableDelete(&newTag, newHash);
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- LWLockRelease(newPartitionLock);
+ if (!CHashDelete(SharedBufHash, &newEnt.key))
+ elog(ERROR, "shared buffer hash table corrupted");
UnpinBuffer(buf, true);
}
@@ -1168,7 +1155,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* the old content is no longer relevant. (The usage_count starts out at
* 1 so that the buffer can survive one clock-sweep pass.)
*/
- buf->tag = newTag;
+ buf->tag = newEnt.key;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
@@ -1178,14 +1165,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnlockBufHdr(buf);
- if (oldFlags & BM_TAG_VALID)
- {
- BufTableDelete(&oldTag, oldHash);
- if (oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- }
-
- LWLockRelease(newPartitionLock);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
@@ -1220,42 +1202,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
static void
InvalidateBuffer(volatile BufferDesc *buf)
{
- BufferTag oldTag;
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
+ BufferLookupEnt oldEnt;
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
- oldTag = buf->tag;
-
- UnlockBufHdr(buf);
-
- /*
- * Need to compute the old tag's hashcode and partition lock ID. XXX is it
- * worth storing the hashcode in BufferDesc so we need not recompute it
- * here? Probably not.
- */
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-retry:
-
- /*
- * Acquire exclusive mapping lock in preparation for changing the buffer's
- * association.
- */
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-
- /* Re-lock the buffer header */
- LockBufHdr(buf);
-
- /* If it's changed while we were waiting for lock, do nothing */
- if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
- {
- UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
- return;
- }
+ oldEnt.key = buf->tag;
/*
* We assume the only reason for it to be pinned is that someone else is
@@ -1266,15 +1217,21 @@ retry:
* yet done StartBufferIO, WaitIO will fall through and we'll effectively
* be busy-looping here.)
*/
- if (buf->refcount != 0)
+ while (buf->refcount != 0)
{
UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
/* safety check: should definitely not be our *own* pin */
if (GetPrivateRefCount(buf->buf_id) > 0)
elog(ERROR, "buffer is pinned in InvalidateBuffer");
WaitIO(buf);
- goto retry;
+ LockBufHdr(buf);
+
+ /* If it's changed while we were waiting for lock, do nothing */
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldEnt.key))
+ {
+ UnlockBufHdr(buf);
+ return;
+ }
}
/*
@@ -1291,13 +1248,9 @@ retry:
/*
* Remove the buffer from the lookup hashtable, if it was in there.
*/
- if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag, oldHash);
-
- /*
- * Done with mapping lock.
- */
- LWLockRelease(oldPartitionLock);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Insert the buffer at the head of the list of free buffers.
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 3add619..2410dfc 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -432,7 +432,7 @@ StrategyShmemSize(void)
Size size = 0;
/* size of lookup hash table ... see comment in StrategyInitialize */
- size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+ size = add_size(size, BufMgrShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
/* size of the shared replacement strategy control block */
size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
@@ -462,7 +462,7 @@ StrategyInitialize(bool init)
* happening in each partition concurrently, so we could need as many as
* NBuffers + NUM_BUFFER_PARTITIONS entries.
*/
- InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+ BufMgrInitShmem(NBuffers + NUM_BUFFER_PARTITIONS);
/*
* Get or create the shared strategy control block
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 250e312..5f94f57 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
return structPtr;
}
+/*
+ * ShmemAttachStruct -- Attach to an existing structure in shared memory.
+ */
+void *
+ShmemAttachStruct(const char *name)
+{
+ ShmemIndexEnt *result;
+ void *ptr;
+ bool found;
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_FIND, &found);
+ if (!found || result == NULL)
+ elog(ERROR, "shared memory structure %s not found", name);
+ ptr = result->location;
+ Assert(ptr != NULL);
+
+ LWLockRelease(ShmemIndexLock);
+
+ return ptr;
+}
/*
* Add two Size values, checking for overflow
diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile
index 05d347c..5d53382 100644
--- a/src/backend/utils/hash/Makefile
+++ b/src/backend/utils/hash/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/hash
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = dynahash.o hashfn.o
+OBJS = chash.o dynahash.o hashfn.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c
new file mode 100644
index 0000000..0d4dc78
--- /dev/null
+++ b/src/backend/utils/hash/chash.c
@@ -0,0 +1,1075 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.c
+ * concurrent hash tables
+ *
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct". Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection. Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below). This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer. This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word. In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic. An additional advantage of this representation
+ * is that aligned 32-bit loads and stores are atomic on all architectures
+ * we support, but 64-bit loads and stores are not.
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared memory entity into the non-key portion
+ * of the user-supplied entity. In this way, clients of this module
+ * never use pointers into shared memory directly.
+ *
+ * As normal, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket. If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp) so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
+ *
+ * Items are inserted into buckets using compare-and-swap (CAS). Thus, this
+ * module cannot be used on architectures where we do not have 4-byte
+ * compare-and-swap. When an item is deleted, we first set its mark bit,
+ * which is stored within the next-pointer, again using CAS. Once this
+ * step is completed, the node is considered deleted. As a cleanup, we then
+ * use CAS to modify the next-pointer of the previous node to point to the
+ * node following the deleted node. Note that, even once this cleanup
+ * operation has been performed, some other backend concurrently walking the
+ * chain might still be visiting the deleted node. Thus, we must be certain
+ * not to overwrite the deleted node's next-pointer until all concurrent
+ * bucket scans have completed. This means, in particular, that we cannot
+ * immediately view the deleted node as available for reuse.
+ *
+ * Instead, when a delete-marked node is removed from the bucket chain, it
+ * is added to one of many garbage lists. There is a many-to-one mapping from
+ * buckets to garbage lists, so that the choice of bucket determines the
+ * garbage list but not vice versa. Any process which wishes to scan a bucket
+ * must first advertise in shared memory the corresponding garbage list number.
+ * When a backend wishes to move entries from a garbage list to a free list,
+ * it must first wait (by spinning) for any backends scanning that garbage
+ * list to complete their scans.
+ *
+ * Ideally, it would be nice to make this completely lock-free, but because
+ * of the above-described choice of garbage collection algorithm, it currently
+ * isn't. For an algorithm to be lock-free, it must be possible to suspend
+ * the execution of any number of processes for an arbitrary period of time
+ * without impeding the overall progress of the system. For this code, that
+ * is true except when garbage collection occurs. In that case, an insert,
+ * search, or delete operation can obstruct an inserting thread attempting to
+ * perform garbage collection for an unbounded period of time. The algorithm
+ * could be adapted to be completely lock-free, essentially by guaranteeing
+ * that even in the worst case no combination of processes can obstruct garbage
+ * collection to a sufficient degree as to prevent an inserter from finding
+ * an available entry in a hash table containing fewer live elements than its
+ * declared maximum capacity. However, it's not clear that this is a good
+ * idea, because it would likely slow down operation in the non-contended
+ * case to solve a problem that we hope won't happen anyway.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/hash/chash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/hash.h"
+#include "storage/barrier.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/chash.h"
+#include "utils/memutils.h"
+
+/*
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
+ */
+typedef uint32 CHashPtr;
+#define InvalidCHashPtr ((uint32) -2)
+#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr)
+#define CHashPtrIsMarked(x) ((x) & 1)
+#define CHashPtrGetOffset(x) ((x) >> 1)
+#define CHashPtrMark(x) ((x) | 1)
+#define CHashPtrUnmark(x) ((x) & ~1)
+#define MakeCHashPtr(x) ((x) << 1)
+#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
+
+/*
+ * Each object stored in the hash table is represented by a CHashNode, which
+ * stores a pointer to the next item in the same bucket, and the exact hash
+ * value of the current item. Each CHashNode is followed by space for the
+ * item itself.
+ */
+typedef struct
+{
+ CHashPtr next; /* arena offset of next element */
+ union
+ {
+ uint32 hashcode; /* hash(key) */
+ CHashPtr gcnext; /* arena offset of next garbage item */
+ } un;
+} CHashNode;
+
+#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x) (((char *) x) + SizeOfCHashNode)
+
+/*
+ * CHashTableData stores all the information that we need in order to access
+ * a concurrent hash table. We store one copy of this data in shared memory,
+ * and an additional copy in the private memory of each backend accessing the
+ * table.
+ */
+typedef struct CHashTableData
+{
+ /*
+ * These fields do not change after initialization.
+ */
+ CHashDescriptor desc; /* descriptor for this hash table */
+ uint32 bucket_mask; /* # of buckets, minus one */
+ uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */
+ uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */
+ uint16 nfreelists; /* # of freelists */
+ uint32 arena_limit; /* # of arena elements */
+ uint32 arena_stride; /* bytes allocated per arena element */
+ CHashPtr *bucket; /* array with 1 entry per bucket */
+ CHashPtr *extra; /* entries for garbage and free lists */
+ char *arena; /* arena */
+
+ /*
+ * These fields will be different in each backend; the shared copy is
+ * irrelevant.
+ */
+ int gc_pid; /* PID that set gc_next */
+ uint32 gc_next; /* next garbage list to reclaim */
+ uint64 stats[CHS_NumberOfStatistics]; /* statistics */
+} CHashTableData;
+
+/* Compute # of buckets, garbage lists, or free lists. */
+#define CHashTableNBuckets(table) \
+ ((table)->bucket_mask + 1)
+#define CHashTableNGarbage(table) \
+ (CHashTableNBuckets((table)) >> (table)->garbage_shift)
+#define CHashTableNFreeLists(table) \
+ ((table)->nfreelists)
+
+/*
+ * Garbage lists and free lists are interleaved to reduce cache line
+ * contention on the free lists, so the calculation of where an individual
+ * list is located is a bit complex.
+ */
+#define CHashTableGetGarbageList(table, n) \
+ (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
+#define CHashTableGetGarbageByBucket(table, n) \
+ (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
+#define CHashTableGetFreeList(table, n) \
+ (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
+
+/* Access macros for arena nodes. */
+#define CHashTableGetRaw(table, offset) \
+ (AssertMacro((offset) < (table)->arena_limit), \
+ (CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
+#define CHashTableGetNode(table, ptr) \
+ (AssertMacro(!CHashPtrIsInvalid(ptr)), \
+ CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
+
+/* Statistics macros. */
+#define CHashTableIncrementStatistic(table, stat) \
+ ((table)->stats[(stat)]++)
+
+/*
+ * Bucket scan.
+ */
+typedef struct
+{
+ CHashPtr target;
+ CHashPtr next;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node;
+ bool found;
+} CHashScanResult;
+
+/* Human-readable statistics names. */
+char *CHashStatisticsNames[] = {
+ "searches",
+ "searches failed",
+ "inserts",
+ "inserts failed",
+ "inserts retried",
+ "deletions",
+ "deletions failed",
+ "deletions retried",
+ "scan expunges",
+ "scan expunges failed",
+ "scans restarted",
+ "cleanup scans",
+ "allocations failed",
+ "garbage enqueues retried",
+ "garbage dequeues failed",
+ "garbage collections",
+ "garbage collection spins",
+ "garbage collection reclaims skipped",
+ "garbage collection fast reclaims",
+ "garbage collection reclaims retried",
+ "<end>"
+};
+
+/* Function prototypes. */
+static CHashPtr CHashAllocate(CHashTable table);
+static CHashPtr CHashAllocateViaGC(CHashTable table);
+static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
+static void CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res);
+
+/*
+ * First stage of CHashTable initialization. We fill in all the constants
+ * here, but not the pointers.
+ */
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
+{
+ CHashTable table;
+ uint32 bucket_shift;
+
+ /* Sanity check. */
+ Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>"));
+
+ /* Allocate table and copy descriptor. */
+ table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData));
+ memcpy(&table->desc, desc, sizeof(CHashDescriptor));
+
+ /* Sanity checks. */
+ if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
+ elog(ERROR, "invalid capacity for concurrent hash");
+ if (desc->key_size < 1 || desc->key_size > desc->element_size)
+ elog(ERROR, "invalid key size for concurrent hash");
+
+ /*
+ * The number of buckets must be a power of two. To avoid (as much as
+ * possible) having to traverse long bucket chains, we aim for a load
+ * factor <= 1.0, so this is a pretty simple calculation: we just find the
+ * smallest power of two greater than or equal to the target capacity.
+ */
+ bucket_shift = fls(desc->capacity) - 1;
+ table->bucket_mask = (1 << bucket_shift) - 1;
+
+ /*
+ * We choose to have one garbage list for every 64 hash table buckets
+ * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
+ * total, in which case we still have a minimum of one garbage list.
+ *
+ * Increasing the garbage_shift would reduce the likelihood of a backend
+ * performing garbage collection needing to wait for a backend walking a
+ * bucket to finish its scan. On the other hand, decreasing the garbage
+ * shift would allow more items to be recovered in a single garbage
+ * collection cycle. It's not clear what the optimal value is.
+ */
+ table->garbage_shift = Min(bucket_shift, 6);
+ table->gc_next = 0;
+ table->gc_pid = 0;
+
+ /*
+ * Experimentation reveals that the free list manipulation is both one of
+ * the slowest parts of this algorithm and the most vulnerable to
+ * contention. Therefore, we want to have as many free lists as possible,
+ * but there's no need to have more than the number of CPU cores, so we
+ * limit the number of freelists to 64. There might be a benefit in some
+ * larger limit on a really big system, but we'll cap it here pending some
+ * actual test results. We're also limited to having no more freelists
+ * than we do garbage lists.
+ */
+#define LOG2_MAX_FREELIST 6
+ if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
+ table->freelist_shift = 0;
+ else
+ table->freelist_shift =
+ (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
+ table->nfreelists =
+ 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
+
+ /*
+ * To make garbage collection efficient, we overallocate. Normally, we
+ * overallocate by one-eighth, but if that would be less than 15 elements,
+ * then we allocate 15 elements instead. This extra capacity can actually
+ * be used, but for best performance, it shouldn't be. It's the caller's
+ * responsibility to avoid this.
+ */
+ table->arena_limit = desc->capacity;
+ if (desc->capacity < 120)
+ table->arena_limit += 15;
+ else
+ table->arena_limit += table->arena_limit / 8;
+
+ /* Each arena element must be MAXALIGN'd and include per-node space. */
+ table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+ /* Zero out statistics. */
+ memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics);
+
+ return table;
+}
+
+/*
+ * Estimate shared memory requirements.
+ */
+Size
+CHashEstimateSize(CHashTable table)
+{
+ Size total_buckets;
+ Size size;
+ Size nbuckets = CHashTableNBuckets(table);
+ Size ngarbage = CHashTableNGarbage(table);
+ Size nfreelists = CHashTableNFreeLists(table);
+
+ Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
+ total_buckets = add_size(nbuckets, ngarbage);
+ total_buckets = add_size(total_buckets, nfreelists);
+
+ size = MAXALIGN(sizeof(CHashTableData));
+ size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
+ size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
+
+ return size;
+}
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+ Size size;
+ bool found;
+ void *shmem;
+ uint32 i;
+ uint32 nbuckets;
+ uint32 nfreelists;
+ uint32 ngarbage;
+ uint32 nextra;
+
+ /*
+ * If we're under the postmaster, this must be the EXEC_BACKEND case where
+ * we need to attach to an existing shared-memory segment.
+ */
+ if (IsUnderPostmaster)
+ {
+ void *shmem;
+
+ Assert(table == NULL);
+ table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+ shmem = ShmemAttachStruct(desc->shmem_name);
+ memcpy(table, shmem, sizeof(CHashTableData));
+ return table;
+ }
+
+ /*
+ * Otherwise, the hash table should not already exist, and we must
+ * create it. But the table should already be bootstrapped, since we
+ * must previously have computed its size when figuring out our shared
+ * memory allocation.
+ */
+ Assert(table != NULL);
+ size = CHashEstimateSize(table);
+ shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+ Assert(!found);
+
+ /* Bucket, garbage, and freelist arrays follow table info. */
+ table->bucket = (CHashPtr *)
+ (((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+ nbuckets = CHashTableNBuckets(table);
+ table->extra = &table->bucket[nbuckets];
+
+ /* Arena follows the various lists. */
+ ngarbage = CHashTableNGarbage(table);
+ nfreelists = CHashTableNFreeLists(table);
+ nextra = ngarbage + nfreelists;
+ table->arena = (void *) (&table->extra[nextra]);
+
+ /* Initialize all three sets of lists to empty. */
+ for (i = 0; i < nbuckets; ++i)
+ table->bucket[i] = InvalidCHashPtr;
+ for (i = 0; i < nextra; ++i)
+ table->extra[i] = InvalidCHashPtr;
+
+ /* Put all arena elements on the free lists. */
+ for (i = 0; i < table->arena_limit; ++i)
+ {
+ CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists);
+ CHashNode *n = CHashTableGetRaw(table, i);
+
+ n->un.gcnext = *f;
+ *f = MakeCHashPtr(i);
+ }
+
+ /*
+ * Copy table (with pointers now filled in) to shared memory. This is
+ * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+ */
+ memcpy(shmem, table, sizeof(CHashTableData));
+
+ return table;
+}
+
+/*
+ * Search a concurrent hash table. entry should be a block of memory large
+ * enough to hold a complete entry, with just the key portion filled in. If
+ * a matching entry is found, this function will fill in the rest of the entry
+ * from the data in the hash table and return true. If not, it will return
+ * false.
+ */
+bool
+CHashSearch(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket and return data from any matching entry. */
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ CHashTableIncrementStatistic(table, CHS_Search);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Search_Failed);
+ return scan.found;
+}
+
+/*
+ * Insert into a concurrent hash table. entry should be the complete entry
+ * to be inserted. If no duplicate entry is found in the table, this function
+ * will insert the entry and return true. Otherwise, the duplicate entry's
+ * value will be copied into the supplied entry and the function will return
+ * false.
+ *
+ * The caller is responsible for ensuring that no inserts are performed into
+ * a hash table which is at capacity. The behavior of such an insert is
+ * undefined (the actual behavior is that the insert may either succeed,
+ * degrading performance; or CHashAllocate may enter a tight loop until such
+ * time as an element is deleted).
+ */
+bool
+CHashInsert(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashPtr new;
+ CHashNode *nnew;
+ CHashScanResult scan;
+
+ /*
+ * Allocate and initialize a new entry, on the assumption that the insert
+ * will succeed. If it ends up failing, we must be sure to put this back
+ * on some free list, lest it be permanently leaked.
+ */
+ new = CHashAllocate(table);
+ nnew = CHashTableGetNode(table, new);
+ nnew->un.hashcode = hashcode;
+ memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
+
+ /* Prevent garbage collection for this bucket. */
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /*
+ * Scan the bucket. If we don't find a match, use compare-and-swap to
+ * insert the new node at the insert position. If we do find a match,
+ * return the data to the caller.
+ */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+ else
+ {
+ /*
+ * We didn't find a matching element, so use compare-and-swap to
+ * attempt to insert the new element we've prepared. If this fails,
+ * someone concurrently inserted or deleted an element. The correct
+ * insertion point might have changed, or the key we're trying to
+ * insert might now be present when it wasn't before, so we'll have
+ * to search the bucket chain anew.
+ *
+ * There is a risk of starvation here, but we hope it will not happen
+ * in practice. Contention for inserting new elements should be
+ * spread out pretty much evenly across N+M possible insertion points,
+ * where N is the number of buckets and M is the number of elements
+ * in the table. Even for a quite modestly sized table this is likely
+ * to exceed the number of CPU cores.
+ */
+ Assert(!CHashPtrIsMarked(scan.target));
+ nnew->next = scan.target;
+ if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target, new))
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Retry);
+ goto retry;
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /*
+ * If the insert failed, add the entry we allocated above to the
+ * appropriate garbage list. We can't simply put it back on the freelist,
+ * because that leads to an ABA problem: some other backend could
+ * read the head of the freelist, go into the tank, and then use
+ * CAS to pop an element. If at that point, it finds the same
+ * element at the head of the freelist but with a different successor,
+ * we're hosed. To prevent that, we ensure that elements are added
+ * to a given freelist only after verifying that any allocations in
+ * progress at the time we popped the freelist have completed. This
+ * guarantees that any allocation still in progress at the time this
+ * element makes it back to the freelist is trying to allocate some
+ * other node.
+ */
+ CHashTableIncrementStatistic(table, CHS_Insert);
+ if (scan.found)
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Failed);
+ CHashAddToGarbage(table, bucket, new);
+ }
+
+ /* The insert succeeded if and only if no duplicate was found. */
+ return !scan.found;
+}
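
A hypothetical insert, reusing the BufMapEnt sketch above: the caller fills
in the whole entry, and on a duplicate key the existing payload is copied
back into the caller's entry and false is returned.

    static int
    insert_buffer(CHashTable buf_map, const BufferTag *tag, int buf_id)
    {
        BufMapEnt   ent;

        ent.key = *tag;
        ent.buf_id = buf_id;
        if (CHashInsert(buf_map, &ent))
            return buf_id;      /* our mapping went in */
        return ent.buf_id;      /* a mapping for this tag already existed */
    }
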
+
+/*
+ * Delete from a concurrent hash table. entry need only contain the key field.
+ * Returns true if we find and delete a matching key and false otherwise.
+ */
+bool
+CHashDelete(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket. */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+
+ /* If we found it, try to delete it. */
+ if (scan.found)
+ {
+ Assert(!CHashPtrIsMarked(scan.next));
+
+ /* Attempt to apply delete-mark. */
+ if (!__sync_bool_compare_and_swap(&scan.target_node->next,
+ scan.next,
+ CHashPtrMark(scan.next)))
+ {
+ CHashTableIncrementStatistic(table, CHS_Delete_Retry);
+ goto retry;
+ }
+
+ /* Deletion is done; attempt to remove node from list. */
+ if (__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target,
+ scan.next))
+ CHashAddToGarbage(table, bucket, scan.target);
+ else
+ {
+ CHashScanResult cleanup_scan;
+
+ /*
+ * If we weren't able to remove the deleted item, rescan
+ * the bucket to make sure it's really gone. This is just
+ * like a regular bucket scan, except that we don't care
+ * about the results. We're just doing it to achieve the
+ * side-effect of removing delete-marked nodes from the
+ * bucket chain.
+ */
+ CHashTableIncrementStatistic(table, CHS_Cleanup_Scan);
+ CHashBucketScan(table, b, hashcode, entry, &cleanup_scan);
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /* We're done. */
+ CHashTableIncrementStatistic(table, CHS_Delete);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Delete_Failed);
+ return scan.found;
+}
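
A matching delete sketch (again using the hypothetical BufMapEnt): only the
key portion of the entry needs to be filled in.

    static void
    delete_buffer(CHashTable buf_map, const BufferTag *tag)
    {
        BufMapEnt   ent;

        ent.key = *tag;
        if (!CHashDelete(buf_map, &ent))
            elog(LOG, "buffer mapping already gone");
    }
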
+
+/*
+ * Provide backend-local statistics to caller.
+ */
+void
+CHashStatistics(CHashTable table, uint64 *stats)
+{
+ memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics);
+}
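
A caller could dump these backend-local counters like so; this is only a
sketch, assuming CHashStatisticsNames (declared in chash.h) is indexed by
CHashStatisticsType:

    static void
    report_chash_stats(CHashTable table)
    {
        uint64      stats[CHS_NumberOfStatistics];
        int         i;

        CHashStatistics(table, stats);
        for (i = 0; i < CHS_NumberOfStatistics; ++i)
            elog(LOG, "%s: " UINT64_FORMAT,
                 CHashStatisticsNames[i], stats[i]);
    }
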
+
+/*
+ * Scan one bucket of a concurrent hash table, storing the results in a
+ * CHashScanResult object provided by the caller.
+ *
+ * Caller must suppress garbage collection for the target bucket before
+ * calling this function, and resume it afterwards.
+ *
+ * On return, res->found will be true if a matching item was found and false
+ * otherwise. res->target will be a CHashPtr referencing the matching item,
+ * or the first one following the position where the matching item should have
+ * been; res->pointer_to_target will be a pointer to the memory address from
+ * which res->target was read.
+ *
+ * If res->target is not invalid, then res->target_node is a pointer to its
+ * location in memory. If res->found, then res->next will be the next pointer
+ * of res->target_node; otherwise, it's undefined.
+ */
+static void
+CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res)
+{
+ CHashPtr target;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node = NULL;
+
+retry:
+ pointer_to_target = start;
+ target = *pointer_to_target;
+ for (;;)
+ {
+ CHashPtr next;
+ uint32 h;
+ int cmp;
+
+ /*
+ * If we've reached the end of the bucket chain, stop; otherwise,
+ * figure out the actual address of the next item.
+ */
+ if (CHashPtrIsInvalid(target))
+ {
+ res->found = false;
+ break;
+ }
+ target_node = CHashTableGetNode(table, target);
+
+ /*
+ * target may have been fetched from an arena entry that could be
+ * concurrently modified, so a dependency barrier is required before
+ * dereferencing the derived pointer.
+ */
+ pg_read_barrier_depends();
+ next = target_node->next;
+
+ /*
+ * For simplicity, any bucket scan, even if it's servicing a search,
+ * will attempt to remove delete-marked items from the bucket. This
+ * ensures that delete-marked elements are removed from bucket chains
+ * as quickly as possible and reduces code duplication. See
+ * CHashDelete for further comments about why delete-marking is
+ * necessary and how it allows safe deletion.
+ */
+ if (CHashPtrIsMarked(next))
+ {
+zap:
+ if (__sync_bool_compare_and_swap(pointer_to_target,
+ target,
+ CHashPtrUnmark(next)))
+ {
+ /*
+ * No one else can possibly have modified target_node->next,
+ * because such modifications are not allowed after a
+ * delete-mark has been applied. Thus, if we just keep
+ * following the next pointers, we're guaranteed to visit
+ * all non-deleted items (and possibly some deleted items)
+ * that were present at the time we began the scan.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge);
+ CHashAddToGarbage(table, hashcode & table->bucket_mask,
+ target);
+ target = CHashPtrUnmark(next);
+ }
+ else
+ {
+ /*
+ * If the previous node has been delete-marked, we can't
+ * further update the next-pointer, so restart the scan.
+ * Otherwise, we know that some other backend removed one
+ * or more deleted nodes from the bucket chain just as we
+ * were trying to do, and we can simply continue the scan
+ * from wherever the previous node is pointing now. It's
+ * possible that some concurrent inserts have happened, too,
+ * but that's OK; we can view those as having happened "before"
+ * whatever operation this scan is supporting.
+ *
+ * Although starvation is a theoretical possibility here if
+ * we are forced to retry repeatedly, even a single retry is
+ * vanishingly unlikely in practice. It requires some other
+ * backend to delete both the node that we're looking at and
+ * the node which precedes it before we advance to the next
+ * node. That could certainly happen occasionally, but we'd
+ * have to be pretty unlucky to have it happen even twice in
+ * a row.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail);
+ target = *pointer_to_target;
+ if (CHashPtrIsMarked(target))
+ {
+ CHashTableIncrementStatistic(table, CHS_Scan_Restart);
+ goto retry;
+ }
+ }
+ continue;
+ }
+
+ /*
+ * Bucket chains are kept in order, so that there is exactly one legal
+ * point at which any given key can be inserted. The ordering is by
+ * hashcode first, and then by memcmp ordering of the keys involved.
+ */
+ h = target_node->un.hashcode;
+ if (h == hashcode)
+ cmp = memcmp(CHashNodeGetItem(target_node), key,
+ table->desc.key_size);
+ else if (h > hashcode)
+ cmp = 1;
+ else
+ cmp = -1;
+
+ /*
+ * If cmp < 0, then we haven't yet reached the point at which we'd
+ * expect to find the key, so we must continue the scan. If cmp == 0,
+ * we've found the key and can stop. If cmp > 0, we've either passed
+ * the point where we expect to find the key OR someone delete-marked
+ * the item and overwrote the hashcode with a gcnext pointer. In the
+ * latter case we must take care not to be fooled into stopping the
+ * scan early.
+ */
+ if (cmp >= 0)
+ {
+ if (cmp == 0)
+ {
+ res->found = true;
+ res->next = next;
+ break;
+ }
+ else
+ {
+ /*
+ * pg_read_barrier() prevents the reread of the next pointer
+ * from being speculated ahead of the read of the hash value.
+ */
+ pg_read_barrier();
+ next = target_node->next;
+ if (CHashPtrIsMarked(next))
+ goto zap;
+ res->found = false;
+ break;
+ }
+ }
+
+ /* Continue scan from next node. */
+ pointer_to_target = &target_node->next;
+ target = next;
+ }
+
+ /* Send results back to caller. */
+ res->target = target;
+ res->pointer_to_target = pointer_to_target;
+ res->target_node = target_node;
+}
+
+/*
+ * Allocate an arena slot for a new item to be inserted into a hash table.
+ *
+ * We don't want to wait until every single free-list is completely empty
+ * before beginning to garbage collect, because that could result in very
+ * fast allocation followed by a storm of garbage collection activity.
+ * It could also lead to every inserting backend ganging up on the only
+ * non-empty freelist.
+ *
+ * To avoid that, we check free lists and garbage lists in alternation.
+ * We always start with the same free list - which one is based on our
+ * backend ID - but we try to round-robin over all the available garbage
+ * lists. Whenever we successfully garbage collect, we put the recovered
+ * items on our own free list. In this way, if there's only one backend
+ * active, it will typically find a free buffer in the first place it looks:
+ * its own free list. It will also settle into a pattern of garbage
+ * collecting the garbage list which it has visited least recently, which
+ * is what we want.
+ */
+static CHashPtr
+CHashAllocate(CHashTable table)
+{
+ uint32 f_current;
+ CHashPtr new;
+
+ /* Pick a starting freelist based on our backend ID. */
+ f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+
+ /* If this process hasn't initialized gc_next yet, do that now. */
+ if (table->gc_pid != MyProcPid)
+ {
+ table->gc_pid = MyProcPid;
+ table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
+ }
+
+ /* Loop until we allocate a buffer. */
+ for (;;)
+ {
+ CHashPtr *b;
+
+ /*
+ * Try to pop a buffer from a freelist using compare-and-swap.
+ *
+ * Note that this is only safe if it's impossible for the next pointer
+ * of the first element on the list to change between the time when
+ * we read it and the time we use CAS to pop it off the list. This
+ * means that we can't allow any element that is currently on this
+ * freelist to be allocated, marked as garbage, garbage collected,
+ * and returned back to this freelist before we finish the CAS.
+ *
+ * If we attempt to pop the free-list and fail, we retry immediately
+ * with the same free-list. This reduces the frequency with which
+ * we're obliged to update our hazard pointers, which is a material
+ * savings due to the associated memory barrier.
+ */
+ b = CHashTableGetFreeList(table, f_current);
+ MyProc->hazard[0] = b;
+ pg_memory_barrier();
+ new = *b;
+ while (!CHashPtrIsInvalid(new))
+ {
+ CHashNode *n = CHashTableGetNode(table, new);
+
+ /*
+ * n is computed from table->freelist[f_current], which could
+ * be modified by concurrent activity, so we need a dependency
+ * barrier here.
+ */
+ pg_read_barrier_depends();
+ if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
+ return new;
+ CHashTableIncrementStatistic(table, CHS_Allocate_Fail);
+ new = *b;
+ }
+
+ /* Attempt garbage collection. */
+ new = CHashAllocateViaGC(table);
+ if (!CHashPtrIsInvalid(new))
+ return new;
+
+ /* Advance to next freelist. */
+ f_current = (f_current + 1) % CHashTableNFreeLists(table);
+ }
+}
+
+/*
+ * Attempt to satisfy an allocation request via garbage collection.
+ */
+static CHashPtr
+CHashAllocateViaGC(CHashTable table)
+{
+ uint32 f_home;
+ CHashPtr *b;
+ CHashPtr *fh;
+ CHashPtr fhead;
+ CHashPtr garbage;
+ CHashPtr new;
+ CHashNode *n;
+ uint32 i;
+
+ /* Pick a target freelist based on our backend ID. */
+ f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+ fh = CHashTableGetFreeList(table, f_home);
+
+ /* Select target garbage list. */
+ table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
+ b = CHashTableGetGarbageList(table, table->gc_next);
+ garbage = *b;
+
+ /* If list is empty, fail. */
+ if (CHashPtrIsInvalid(garbage))
+ return InvalidCHashPtr;
+
+ /* If we're unable to empty the list via compare-and-swap, fail. */
+ if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
+ {
+ CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail);
+ return InvalidCHashPtr;
+ }
+
+ /*
+ * Before moving elements removed from the garbage list to the
+ * freelist, we must wait until (1) all bucket scans that might
+ * still see elements on the freelist as part of the bucket chain
+ * have completed and (2) all allocations that might see an old
+ * version of the freelist containing one of the elements to be
+ * garbage collected have completed.
+ *
+ * Note: We can't begin this operation until the clearing of the
+ * garbage list has been committed to memory, but since that was
+ * done using an atomic operation no explicit barrier is needed
+ * here.
+ *
+ * Note: We could have a "soft" version of this that merely
+ * requeues the garbage if it's not immediately recyclable, but
+ * it's not clear that we need such a thing. On the flip side we
+ * might want to eventually enter a longer sleep here, or PANIC,
+ * but it's not clear exactly how to calibrate that.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC);
+ MyProc->hazard[0] = NULL;
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ volatile PGPROC *proc = &ProcGlobal->allProcs[i];
+ void *hazard;
+
+ hazard = proc->hazard[0];
+ if (hazard == b || hazard == fh)
+ {
+ CHashTableIncrementStatistic(table, CHS_GC_Spin);
+ do
+ {
+ hazard = proc->hazard[0];
+ } while (hazard == b || hazard == fh);
+ }
+ }
+
+ /* Remove one item from list to satisfy current allocation. */
+ new = garbage;
+ n = CHashTableGetNode(table, new);
+ pg_read_barrier_depends();
+ fhead = n->un.gcnext;
+
+ if (CHashPtrIsInvalid(fhead))
+ {
+ /*
+ * We have reclaimed exactly one node, so there's nothing to put
+ * back on the free list. In this case (only) we need a
+ * memory barrier, because the reads above must complete
+ * before we overwrite n->un.gcnext with a new hashcode.
+ * (This is only needed when we reclaim exactly one node,
+ * because in any other case we'll do a compare-and-swap
+ * before returning, which implies a full barrier.)
+ */
+ pg_memory_barrier();
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped);
+ }
+ else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead))
+ {
+ /*
+ * Our free list is empty, and we've successfully pushed the
+ * reclaimed nodes onto it. So we're done.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast);
+ }
+ else
+ {
+ CHashPtr fcurrent;
+ CHashPtr fnext;
+ CHashPtr oldhead;
+
+ /* Walk list of reclaimed elements to end. */
+ fcurrent = fhead;
+ for (;;)
+ {
+ n = CHashTableGetNode(table, fcurrent);
+ fnext = n->un.gcnext;
+ if (CHashPtrIsInvalid(fnext))
+ break;
+ fcurrent = fnext;
+ }
+
+ /* Push reclaimed elements onto home free list. */
+ for (;;)
+ {
+ oldhead = *fh;
+ n->un.gcnext = oldhead;
+ if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
+ break;
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry);
+ }
+ }
+
+ /* Return the element we saved for ourselves. */
+ return new;
+}
+
+/*
+ * Add an arena slot to the appropriate garbage list.
+ *
+ * The next garbage collection cycle for the affected bucket will move it
+ * to the free list. We can't do that immediately because there might be
+ * someone traversing the list who is counting on being able to follow the
+ * next pointer. It's OK to clobber the hash value, though, since a spurious
+ * failure to match an already-deleted item shouldn't cause any problems;
+ * this is why gcnext can share space with the hash value.
+ */
+static void
+CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
+{
+ CHashPtr g;
+ CHashNode *n;
+ CHashPtr *garbage;
+
+ n = CHashTableGetNode(table, c);
+ garbage = CHashTableGetGarbageByBucket(table, bucket);
+
+ while (1)
+ {
+ g = *garbage;
+ n->un.gcnext = g;
+ if (__sync_bool_compare_and_swap(garbage, g, c))
+ break;
+ CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry);
+ }
+}
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
index cd85633..dbcc0f8 100644
--- a/src/include/storage/barrier.h
+++ b/src/include/storage/barrier.h
@@ -20,4 +20,12 @@
*/
#include "port/atomics.h"
+/*
+ * If dependency barriers are undefined, we define them as a no-op. The only
+ * known platform where anything more is required is DEC Alpha.
+ */
+#if !defined(pg_read_barrier_depends)
+#define pg_read_barrier_depends()
+#endif
+
#endif /* BARRIER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 9b8ace5..b58af88 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -96,20 +96,6 @@ typedef struct buftag
)
/*
- * The shared buffer mapping table is partitioned to reduce contention.
- * To determine which partition lock a given tag requires, compute the tag's
- * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
- * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
- */
-#define BufTableHashPartition(hashcode) \
- ((hashcode) % NUM_BUFFER_PARTITIONS)
-#define BufMappingPartitionLock(hashcode) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
- BufTableHashPartition(hashcode)].lock)
-#define BufMappingPartitionLockByIndex(i) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
-
-/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
@@ -196,14 +182,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
-/* buf_table.c */
-extern Size BufTableShmemSize(int size);
-extern void InitBufTable(int size);
-extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
-extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
-
/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
BlockNumber blockNum);
@@ -215,4 +193,8 @@ extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);
+/* bufmgr.c */
+extern Size BufMgrShmemSize(int size);
+extern void BufMgrInitShmem(int size);
+
#endif /* BUFMGR_INTERNALS_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index e3c2efc..1b37447 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -144,7 +144,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
*/
/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS 128
+#define NUM_BUFFER_PARTITIONS 0
/* Number of partitions the shared lock tables are divided into */
#define LOG2_NUM_LOCK_PARTITIONS 4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index d194f38..8d9b4dd 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -59,6 +59,14 @@ struct XidCache
#define FP_LOCK_SLOTS_PER_BACKEND 16
/*
+ * Some lock-free algorithms require each backend process to be able to
+ * advertise a certain number of "hazard pointers" in shared memory.
+ * Right now one per backend is enough for our purpose, but some
+ * algorithms require more.
+ */
+#define NUM_HAZARD_POINTERS 1
+
+/*
* Each backend has a PGPROC struct in shared memory. There is also a list of
* currently-unused PGPROC structs that will be reallocated to new backends.
*
@@ -143,6 +151,12 @@ struct PGPROC
bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
* lock */
+
+ /*
+ * Hazard pointers. Currently one is enough, though some algorithms
+ * require a few more.
+ */
+ void *hazard[NUM_HAZARD_POINTERS];
};
/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index c94d620..855b65e 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -40,6 +40,7 @@ extern void InitShmemIndex(void);
extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
HASHCTL *infoP, int hash_flags);
extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemAttachStruct(const char *name);
extern Size add_size(Size s1, Size s2);
extern Size mul_size(Size s1, Size s2);
diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h
new file mode 100644
index 0000000..ee0573c
--- /dev/null
+++ b/src/include/utils/chash.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.h
+ * Concurrent shared-memory hash table.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/chash.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHASH_H
+#define CHASH_H
+
+/* Everything caller must supply to set up a concurrent hash table. */
+typedef struct
+{
+ const char *shmem_name; /* shared memory name for this hash table */
+ uint32 capacity; /* maximum size of hash table */
+ uint16 element_size; /* size of each element */
+ uint16 key_size; /* size of each key */
+} CHashDescriptor;
+
+/* Concurrent hash table statistics. */
+typedef enum
+{
+ CHS_Search, /* search */
+ CHS_Search_Failed, /* search failed (no such key) */
+ CHS_Insert, /* insert */
+ CHS_Insert_Failed, /* insert failed (duplicate key) */
+ CHS_Insert_Retry, /* insert retried (concurrent update) */
+ CHS_Delete, /* delete */
+ CHS_Delete_Failed, /* delete failed (no such key) */
+ CHS_Delete_Retry, /* delete retried (concurrent update) */
+ CHS_Scan_Expunge, /* scan expunged deleted item */
+ CHS_Scan_Expunge_Fail, /* scan failed to expunge */
+ CHS_Scan_Restart, /* concurrent deletes forced a scan restart */
+ CHS_Cleanup_Scan, /* concurrent update forced a cleanup scan */
+ CHS_Allocate_Fail, /* allocation failed to pop freelist */
+ CHS_Garbage_Enqueue_Retry, /* enqueue on garbage list retried */
+ CHS_Garbage_Dequeue_Fail, /* dequeue of garbage failed */
+ CHS_GC, /* garbage collection cycle */
+ CHS_GC_Spin, /* GC spun waiting for concurrent process */
+ CHS_GC_Reclaim_Skipped, /* GC recovered only one item */
+ CHS_GC_Reclaim_Fast, /* GC put garbage on freelist via fast path */
+ CHS_GC_Reclaim_Retry, /* enqueue of garbage on freelist retried */
+ CHS_NumberOfStatistics /* number of statistics */
+} CHashStatisticsType;
+
+/* Human-readable names for statistics. */
+extern char *CHashStatisticsNames[];
+
+/* Opaque handle for a concurrent hash table. */
+struct CHashTableData;
+typedef struct CHashTableData *CHashTable;
+
+/* Initialization functions. */
+extern CHashTable CHashBootstrap(CHashDescriptor *desc);
+extern Size CHashEstimateSize(CHashTable table);
+extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc);
+
+/* Accessor functions. */
+extern bool CHashInsert(CHashTable table, void *entry);
+extern bool CHashDelete(CHashTable table, void *key);
+extern bool CHashSearch(CHashTable table, void *entry);
+extern void CHashStatistics(CHashTable table, uint64 *stats);
+
+#endif /* CHASH_H */
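
For completeness, a rough sketch of how a client of this API might be set
up; the names are hypothetical and the exact bootstrap/size/initialize
sequence is inferred from the function names (the patch's BufMgrShmemSize
and BufMgrInitShmem presumably follow the same pattern, but aren't shown
here):

    typedef struct
    {
        BufferTag   key;        /* must be first: the key_size prefix */
        int         buf_id;
    } BufMapEnt;

    static CHashDescriptor buf_map_desc = {
        "buffer mapping table",     /* shmem_name */
        16384,                      /* capacity */
        sizeof(BufMapEnt),          /* element_size */
        sizeof(BufferTag)           /* key_size */
    };

    static CHashTable buf_map;

    Size
    BufMapShmemSize(void)
    {
        buf_map = CHashBootstrap(&buf_map_desc);
        return CHashEstimateSize(buf_map);
    }

    void
    BufMapShmemInit(void)
    {
        buf_map = CHashInitialize(buf_map, &buf_map_desc);
    }
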
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers