This developed a slight merge conflict. I've rebased the attached
version, and I also took the step of getting rid of buf_table.c, as I
think I proposed somewhere upthread. This avoids the overhead of
constructing a BufferTag only to copy it into a BufferLookupEnt, plus
some function calls and so forth. A quick-and-dirty test suggests
this might not have cut down on the 1-client overhead much, but I
think it's worth doing anyway: it's certainly saving a few cycles, and
I don't think it's complicating anything measurably.
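
For reference, the shape of that change (paraphrasing the PrefetchBuffer
hunk in the attached patch, not anything new): the lookup entry's key
field is itself a BufferTag, so callers build the tag in place and hand
the entry straight to the concurrent hash table, instead of building a
BufferTag, hashing it, taking the partition lock, and copying it into a
BufferLookupEnt:

    BufferLookupEnt ent;    /* identity of requested block */

    /* build the tag directly inside the lookup entry */
    INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node,
                   forkNum, blockNum);

    /* on a hit, CHashSearch fills in ent.id with the buffer ID */
    if (!CHashSearch(SharedBufHash, &ent))
        smgrprefetch(reln->rd_smgr, forkNum, blockNum);
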
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/contrib/Makefile b/contrib/Makefile
index 195d447..141ef0a 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -19,6 +19,7 @@ SUBDIRS = \
earthdistance \
file_fdw \
fuzzystrmatch \
+ hashtest \
hstore \
intagg \
intarray \
diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile
new file mode 100644
index 0000000..3ee42f8
--- /dev/null
+++ b/contrib/hashtest/Makefile
@@ -0,0 +1,18 @@
+# contrib/hashtest/Makefile
+
+MODULE_big = hashtest
+OBJS = hashtest.o
+
+EXTENSION = hashtest
+DATA = hashtest--1.0.sql
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/hashtest
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql
new file mode 100644
index 0000000..e271baf
--- /dev/null
+++ b/contrib/hashtest/hashtest--1.0.sql
@@ -0,0 +1,52 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION hashtest" to load this file. \quit
+
+CREATE FUNCTION chash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION chash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'chash_collision_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_insert_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_insert_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_search_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_search_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_delete_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_delete_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_concurrent_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_concurrent_test'
+LANGUAGE C;
+
+CREATE FUNCTION dynahash_collision_test()
+RETURNS void
+AS 'MODULE_PATHNAME', 'dynahash_collision_test'
+LANGUAGE C;
diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c
new file mode 100644
index 0000000..172a5bb
--- /dev/null
+++ b/contrib/hashtest/hashtest.c
@@ -0,0 +1,527 @@
+/*-------------------------------------------------------------------------
+ * hashtest.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "funcapi.h"
+#include "libpq/auth.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/ipc.h"
+#include "utils/chash.h"
+
+PG_MODULE_MAGIC;
+
+void _PG_init(void);
+Datum chash_insert_test(PG_FUNCTION_ARGS);
+Datum chash_search_test(PG_FUNCTION_ARGS);
+Datum chash_delete_test(PG_FUNCTION_ARGS);
+Datum chash_concurrent_test(PG_FUNCTION_ARGS);
+Datum chash_collision_test(PG_FUNCTION_ARGS);
+Datum dynahash_insert_test(PG_FUNCTION_ARGS);
+Datum dynahash_search_test(PG_FUNCTION_ARGS);
+Datum dynahash_delete_test(PG_FUNCTION_ARGS);
+Datum dynahash_concurrent_test(PG_FUNCTION_ARGS);
+Datum dynahash_collision_test(PG_FUNCTION_ARGS);
+static void hashtest_shmem_startup(void);
+
+PG_FUNCTION_INFO_V1(chash_insert_test);
+PG_FUNCTION_INFO_V1(chash_search_test);
+PG_FUNCTION_INFO_V1(chash_delete_test);
+PG_FUNCTION_INFO_V1(chash_concurrent_test);
+PG_FUNCTION_INFO_V1(chash_collision_test);
+PG_FUNCTION_INFO_V1(dynahash_insert_test);
+PG_FUNCTION_INFO_V1(dynahash_search_test);
+PG_FUNCTION_INFO_V1(dynahash_delete_test);
+PG_FUNCTION_INFO_V1(dynahash_concurrent_test);
+PG_FUNCTION_INFO_V1(dynahash_collision_test);
+
+typedef struct
+{
+ uint32 key;
+ uint32 val;
+} hentry;
+
+static CHashDescriptor cdesc = {
+ "hashtest-chash", /* name */
+ 1048576, /* capacity */
+ sizeof(hentry), /* element size */
+ sizeof(uint32) /* key size */
+};
+
+#define DYNAHASH_PARTITIONS 16
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static CHashTable chash;
+static HTAB *dynahash;
+static LWLockId dynahash_lock[DYNAHASH_PARTITIONS];
+static ClientAuthentication_hook_type original_client_auth_hook = NULL;
+
+static void hashtest_client_auth_hook(Port *port, int status);
+static void chash_write_stats_to_log(int code, Datum dummy);
+
+#define dynahash_get_lock(hashcode) \
+ (dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS])
+
+void
+_PG_init(void)
+{
+ Size cs;
+ Size ds;
+
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = hashtest_shmem_startup;
+ chash = CHashBootstrap(&cdesc);
+ cs = CHashEstimateSize(chash);
+ RequestAddinShmemSpace(cs);
+ ds = hash_estimate_size(cdesc.capacity, cdesc.element_size);
+ RequestAddinShmemSpace(ds);
+ elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs,
+ (unsigned) ds);
+ RequestAddinLWLocks(DYNAHASH_PARTITIONS);
+ original_client_auth_hook = ClientAuthentication_hook;
+ ClientAuthentication_hook = hashtest_client_auth_hook;
+
+}
+
+static void
+hashtest_client_auth_hook(Port *port, int status)
+{
+ if (original_client_auth_hook)
+ original_client_auth_hook(port, status);
+ on_proc_exit(chash_write_stats_to_log, (Datum) 0);
+}
+
+static void
+chash_write_stats_to_log(int code, Datum dummy)
+{
+ uint64 stats[CHS_NumberOfStatistics];
+ CHashStatisticsType i;
+ StringInfoData buf;
+
+ CHashStatistics(chash, stats);
+ initStringInfo(&buf);
+
+ for (i = 0; i < CHS_NumberOfStatistics; ++i)
+ {
+ if (stats[i] == 0)
+ continue;
+ appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i],
+ CHashStatisticsNames[i]);
+ }
+
+ if (buf.len > 1)
+ {
+ buf.data[buf.len-2] = '\0';
+ elog(LOG, "chash statistics: %s", buf.data);
+ }
+}
+
+static void
+hashtest_shmem_startup(void)
+{
+ HASHCTL info;
+ uint32 i;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /* Initialize concurrent hash table. */
+ chash = CHashInitialize(chash, &cdesc);
+
+ /* Initialize shared dynahash table. */
+ info.keysize = cdesc.key_size;
+ info.entrysize = cdesc.element_size;
+ info.hash = tag_hash;
+ info.num_partitions = DYNAHASH_PARTITIONS;
+
+ dynahash = ShmemInitHash("hashtest-dynahash",
+ cdesc.capacity, cdesc.capacity,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ for (i = 0; i < DYNAHASH_PARTITIONS; ++i)
+ dynahash_lock[i] = LWLockAssign();
+}
+
+Datum
+chash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = i * 31;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != e.key * 31)
+ elog(LOG, "search %u: found %u", i, e.val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = CHashDelete(chash, &e);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ e.val = 0;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!CHashSearch(chash, &e))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = seed | i;
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!CHashDelete(chash, &e))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+chash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ hentry e;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ e.key = i;
+ e.val = MyProcPid;
+ ok = CHashInsert(chash, &e);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ e.key = i;
+ ok = CHashSearch(chash, &e);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (e.val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, e.val);
+ ok = CHashDelete(chash, &e);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+static bool
+dynahash_insert(uint32 key, uint32 val)
+{
+ bool found;
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_ENTER, &found);
+ if (!found)
+ e->val = val;
+ LWLockRelease(lockid);
+
+ return !found;
+}
+
+static bool
+dynahash_search(uint32 key, uint32 *val)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_SHARED);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_FIND, NULL);
+ if (e)
+ *val = e->val;
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+static bool
+dynahash_delete(uint32 key)
+{
+ uint32 hashcode;
+ hentry *e;
+ LWLockId lockid;
+
+ hashcode = get_hash_value(dynahash, (void *) &key);
+ lockid = dynahash_get_lock(hashcode);
+ LWLockAcquire(lockid, LW_EXCLUSIVE);
+ e = hash_search_with_hash_value(dynahash, (void *) &key,
+ hashcode, HASH_REMOVE, NULL);
+ LWLockRelease(lockid);
+
+ return e != NULL;
+}
+
+Datum
+dynahash_insert_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, i * 31);
+ if (!ok)
+ elog(LOG, "insert %u: failed", i);
+ ok = dynahash_insert(i, i * 31);
+ if (ok)
+ elog(LOG, "insert %u: worked twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_search_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+ uint32 val;
+
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != i * 31)
+ elog(LOG, "search %u: found %u", i, val);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_delete_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+
+ for (i = 0; i < 1000000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ ok = dynahash_delete(i);
+ if (ok)
+ elog(LOG, "delete %u: found twice", i);
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_concurrent_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+ uint32 seed = MyProcPid << 16;
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(seed | i, MyProcPid);
+ if (!ok)
+ elog(LOG, "insert %u: found", i);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_search(seed | i, &val);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "search %u: not found", i);
+ while (!dynahash_search(seed | i, &val))
+ ++retry;
+ elog(LOG, "search %u: eventually found it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_delete(seed | i);
+ if (!ok)
+ {
+ uint64 retry = 1;
+ elog(LOG, "delete %u: not found", i);
+ while (!dynahash_delete(seed | i))
+ ++retry;
+ elog(LOG, "delete %u: eventually deleted it after "
+ UINT64_FORMAT " retries", i, retry);
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+Datum
+dynahash_collision_test(PG_FUNCTION_ARGS)
+{
+ uint32 i;
+ uint32 val;
+
+ /* Don't stack-allocate this. */
+ static bool mine[10000];
+
+ memset(mine, 0, 10000 * sizeof(bool));
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ ok = dynahash_insert(i, MyProcPid);
+ if (ok)
+ mine[i] = true;
+ }
+
+ for (i = 0; i < 10000; ++i)
+ {
+ bool ok;
+
+ if (!mine[i])
+ continue;
+ ok = dynahash_search(i, &val);
+ if (!ok)
+ elog(LOG, "search %u: not found", i);
+ else if (val != MyProcPid)
+ elog(LOG, "search %u: expected %u found %u",
+ i, (unsigned) MyProcPid, val);
+ ok = dynahash_delete(i);
+ if (!ok)
+ elog(LOG, "delete %u: not found", i);
+ }
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control
new file mode 100644
index 0000000..b8e0f01
--- /dev/null
+++ b/contrib/hashtest/hashtest.control
@@ -0,0 +1,4 @@
+comment = 'hash testing code'
+default_version = '1.0'
+module_pathname = '$libdir/hashtest'
+relocatable = true
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
index 2c10fba..b30a0da 100644
--- a/src/backend/storage/buffer/Makefile
+++ b/src/backend/storage/buffer/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o
+OBJS = buf_init.o bufmgr.o freelist.o localbuf.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index a4ebbcc..86697e9 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -100,30 +100,10 @@ Buffer Manager's Internal Locking
Before PostgreSQL 8.1, all operations of the shared buffer manager itself
were protected by a single system-wide lock, the BufMgrLock, which
-unsurprisingly proved to be a source of contention. The new locking scheme
-avoids grabbing system-wide exclusive locks in common code paths. It works
-like this:
-
-* There is a system-wide LWLock, the BufMappingLock, that notionally
-protects the mapping from buffer tags (page identifiers) to buffers.
-(Physically, it can be thought of as protecting the hash table maintained
-by buf_table.c.) To look up whether a buffer exists for a tag, it is
-sufficient to obtain share lock on the BufMappingLock. Note that one
-must pin the found buffer, if any, before releasing the BufMappingLock.
-To alter the page assignment of any buffer, one must hold exclusive lock
-on the BufMappingLock. This lock must be held across adjusting the buffer's
-header fields and changing the buf_table hash table. The only common
-operation that needs exclusive lock is reading in a page that was not
-in shared buffers already, which will require at least a kernel call
-and usually a wait for I/O, so it will be slow anyway.
-
-* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
-separate locks, each guarding a portion of the buffer tag space. This allows
-further reduction of contention in the normal code paths. The partition
-that a particular buffer tag belongs to is determined from the low-order
-bits of the tag's hash value. The rules stated above apply to each partition
-independently. If it is necessary to lock more than one partition at a time,
-they must be locked in partition-number order to avoid risk of deadlock.
+unsurprisingly proved to be a source of contention. In subsequent releases,
+this lock was split into NUM_BUFFER_PARTITIONS locks, each guarding a portion
+of the buffer tag space. Even this allowed too much contention, so
+we now use a highly concurrent hash table (see chash.c and chash.h).
* A separate system-wide spinlock, buffer_strategy_lock, provides mutual
exclusion for operations that access the buffer free list or select
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
deleted file mode 100644
index 6ed47d5..0000000
--- a/src/backend/storage/buffer/buf_table.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * buf_table.c
- * routines for mapping BufferTags to buffer indexes.
- *
- * Note: the routines in this file do no locking of their own. The caller
- * must hold a suitable lock on the appropriate BufMappingLock, as specified
- * in the comments. We can't do the locking inside these functions because
- * in most cases the caller needs to adjust the buffer header contents
- * before the lock is released (see notes in README).
- *
- *
- * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- *
- * IDENTIFICATION
- * src/backend/storage/buffer/buf_table.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include "storage/bufmgr.h"
-#include "storage/buf_internals.h"
-
-
-/* entry for buffer lookup hashtable */
-typedef struct
-{
- BufferTag key; /* Tag of a disk page */
- int id; /* Associated buffer ID */
-} BufferLookupEnt;
-
-static HTAB *SharedBufHash;
-
-
-/*
- * Estimate space needed for mapping hashtable
- * size is the desired hash table size (possibly more than NBuffers)
- */
-Size
-BufTableShmemSize(int size)
-{
- return hash_estimate_size(size, sizeof(BufferLookupEnt));
-}
-
-/*
- * Initialize shmem hash table for mapping buffers
- * size is the desired hash table size (possibly more than NBuffers)
- */
-void
-InitBufTable(int size)
-{
- HASHCTL info;
-
- /* assume no locking is needed yet */
-
- /* BufferTag maps to Buffer */
- info.keysize = sizeof(BufferTag);
- info.entrysize = sizeof(BufferLookupEnt);
- info.num_partitions = NUM_BUFFER_PARTITIONS;
-
- SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
- size, size,
- &info,
- HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
-}
-
-/*
- * BufTableHashCode
- * Compute the hash code associated with a BufferTag
- *
- * This must be passed to the lookup/insert/delete routines along with the
- * tag. We do it like this because the callers need to know the hash code
- * in order to determine which buffer partition to lock, and we don't want
- * to do the hash computation twice (hash_any is a bit slow).
- */
-uint32
-BufTableHashCode(BufferTag *tagPtr)
-{
- return get_hash_value(SharedBufHash, (void *) tagPtr);
-}
-
-/*
- * BufTableLookup
- * Lookup the given BufferTag; return buffer ID, or -1 if not found
- *
- * Caller must hold at least share lock on BufMappingLock for tag's partition
- */
-int
-BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
-{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_FIND,
- NULL);
-
- if (!result)
- return -1;
-
- return result->id;
-}
-
-/*
- * BufTableInsert
- * Insert a hashtable entry for given tag and buffer ID,
- * unless an entry already exists for that tag
- *
- * Returns -1 on successful insertion. If a conflicting entry exists
- * already, returns the buffer ID in that entry.
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-int
-BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
-{
- BufferLookupEnt *result;
- bool found;
-
- Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
- Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_ENTER,
- &found);
-
- if (found) /* found something already in the table */
- return result->id;
-
- result->id = buf_id;
-
- return -1;
-}
-
-/*
- * BufTableDelete
- * Delete the hashtable entry for given tag (which must exist)
- *
- * Caller must hold exclusive lock on BufMappingLock for tag's partition
- */
-void
-BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
-{
- BufferLookupEnt *result;
-
- result = (BufferLookupEnt *)
- hash_search_with_hash_value(SharedBufHash,
- (void *) tagPtr,
- hashcode,
- HASH_REMOVE,
- NULL);
-
- if (!result) /* shouldn't happen */
- elog(ERROR, "shared buffer hash table corrupted");
-}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 7eb2d22..4435b3e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -24,9 +24,7 @@
* MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
* The disk write is delayed until buffer replacement or checkpoint.
*
- * See also these files:
- * freelist.c -- chooses victim for buffer replacement
- * buf_table.c -- manages the buffer lookup table
+ * See also freelist.c, which chooses the victim for buffer replacement
*/
#include "postgres.h"
@@ -47,10 +45,25 @@
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
+#include "utils/chash.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
+/* entry for buffer lookup hashtable */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated buffer ID */
+} BufferLookupEnt;
+
+static CHashDescriptor SharedBufDescriptor = {
+ "buffer lookup table",
+ 0,
+ sizeof(BufferLookupEnt),
+ sizeof(BufferTag)
+};
+static CHashTable SharedBufHash;
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
@@ -138,6 +151,38 @@ static inline int32 GetPrivateRefCount(Buffer buffer);
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
/*
+ * Estimate space needed for mapping hashtable
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+Size
+BufMgrShmemSize(int size)
+{
+ if (SharedBufHash == NULL)
+ {
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashBootstrap(&SharedBufDescriptor);
+ }
+
+ return CHashEstimateSize(SharedBufHash);
+}
+
+/*
+ * Initialize shmem hash table for mapping buffers
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+void
+BufMgrInitShmem(int size)
+{
+ if (SharedBufHash == NULL || !IsUnderPostmaster)
+ {
+ Assert(SharedBufDescriptor.capacity == 0 ||
+ SharedBufDescriptor.capacity == size);
+ SharedBufDescriptor.capacity = size;
+ SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor);
+ }
+}
+
+/*
 * Ensure that the PrivateRefCountArray has sufficient space to store one
* more entry. This has to be called before using NewPrivateRefCountEntry() to
* fill a new entry - but it's perfectly fine to not use a reserved entry.
@@ -444,26 +489,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
}
else
{
- BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
- int buf_id;
+ BufferLookupEnt ent; /* identity of requested block */
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
+ INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
-
- /* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- LWLockRelease(newPartitionLock);
-
/* If not in buffers, initiate prefetch */
- if (buf_id < 0)
+ if (!CHashSearch(SharedBufHash, &ent))
smgrprefetch(reln->rd_smgr, forkNum, blockNum);
/*
@@ -870,43 +903,39 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BufferAccessStrategy strategy,
bool *foundPtr)
{
- BufferTag newTag; /* identity of requested block */
- uint32 newHash; /* hash value for newTag */
- LWLock *newPartitionLock; /* buffer partition lock for it */
- BufferTag oldTag; /* previous identity of selected buffer */
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
+ BufferLookupEnt newEnt; /* identity of requested block */
+ BufferLookupEnt oldEnt; /* previous identity of selected buffer */
BufFlags oldFlags;
- int buf_id;
volatile BufferDesc *buf;
bool valid;
/* create a tag so we can lookup the buffer */
- INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
-
- /* determine its hash code and partition lock ID */
- newHash = BufTableHashCode(&newTag);
- newPartitionLock = BufMappingPartitionLock(newHash);
+ INIT_BUFFERTAG(newEnt.key, smgr->smgr_rnode.node, forkNum, blockNum);
/* see if the block is in the buffer pool already */
- LWLockAcquire(newPartitionLock, LW_SHARED);
- buf_id = BufTableLookup(&newTag, newHash);
- if (buf_id >= 0)
+start:
+ if (CHashSearch(SharedBufHash, &newEnt))
{
+ BufferDesc *foundbuf;
+
/*
* Found it. Now, pin the buffer so no one can steal it from the
- * buffer pool, and check to see if the correct data has been loaded
- * into the buffer.
+ * buffer pool.
*/
- buf = &BufferDescriptors[buf_id];
+ foundbuf = &BufferDescriptors[newEnt.id];
- valid = PinBuffer(buf, strategy);
+ valid = PinBuffer(foundbuf, strategy);
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
+ /* Check whether someone recycled the buffer before we pinned it. */
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto start;
+ }
*foundPtr = TRUE;
+ /* Check to see if the correct data has been loaded into the buffer. */
if (!valid)
{
/*
@@ -916,7 +945,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* own read attempt if the page is still not BM_VALID.
* StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer must
@@ -926,15 +955,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
- /*
- * Didn't find it in the buffer pool. We'll have to initialize a new
- * buffer. Remember to unlock the mapping lock while doing the work.
- */
- LWLockRelease(newPartitionLock);
-
/* Loop here in case we have to try another victim buffer */
for (;;)
{
@@ -1041,42 +1064,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
if (oldFlags & BM_TAG_VALID)
{
- /*
- * Need to compute the old tag's hashcode and partition lock ID.
- * XXX is it worth storing the hashcode in BufferDesc so we need
- * not recompute it here? Probably not.
- */
- oldTag = buf->tag;
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
- /*
- * Must lock the lower-numbered partition first to avoid
- * deadlocks.
- */
- if (oldPartitionLock < newPartitionLock)
- {
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- else if (oldPartitionLock > newPartitionLock)
- {
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
- }
- else
- {
- /* only one partition, only one lock */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- }
- }
- else
- {
- /* if it wasn't valid, we need only the new partition */
- LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
- /* these just keep the compiler quiet about uninit variables */
- oldHash = 0;
- oldPartitionLock = 0;
+ /* Save old tag. */
+ oldEnt.key = buf->tag;
}
/*
@@ -1086,32 +1075,33 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Note that we have not yet removed the hashtable entry for the old
* tag.
*/
- buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
-
- if (buf_id >= 0)
+enter:
+ newEnt.id = buf->buf_id;
+ if (!CHashInsert(SharedBufHash, &newEnt))
{
+ BufferDesc *foundbuf;
+
+ /*
+ * We've got a collision, apparently: it looks like someone else
+ * did what we were about to do. We can handle this as if we had
+ * found the buffer in the pool in the first place, but we must
+ * recheck the buffer tag after pinning it, because it could still
+ * get renamed under us.
+ */
+ foundbuf = &BufferDescriptors[newEnt.id];
+ valid = PinBuffer(foundbuf, strategy);
+ if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag))
+ {
+ UnpinBuffer(foundbuf, true);
+ goto enter;
+ }
+
/*
- * Got a collision. Someone has already done what we were about to
- * do. We'll just handle this as if it were found in the buffer
- * pool in the first place. First, give up the buffer we were
- * planning to use.
+ * Collision confirmed. Give up the buffer we were planning to
+ * use.
*/
UnpinBuffer(buf, true);
- /* Can give up that buffer's mapping partition lock now */
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
-
- /* remaining code should match code at top of routine */
-
- buf = &BufferDescriptors[buf_id];
-
- valid = PinBuffer(buf, strategy);
-
- /* Can release the mapping lock as soon as we've pinned it */
- LWLockRelease(newPartitionLock);
-
*foundPtr = TRUE;
if (!valid)
@@ -1123,7 +1113,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
- if (StartBufferIO(buf, true))
+ if (StartBufferIO(foundbuf, true))
{
/*
* If we get here, previous attempts to read the buffer
@@ -1133,7 +1123,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
- return buf;
+ return foundbuf;
}
/*
@@ -1152,11 +1142,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
break;
UnlockBufHdr(buf);
- BufTableDelete(&newTag, newHash);
- if ((oldFlags & BM_TAG_VALID) &&
- oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- LWLockRelease(newPartitionLock);
+ if (!CHashDelete(SharedBufHash, &newEnt.key))
+ elog(ERROR, "shared buffer hash table corrupted");
UnpinBuffer(buf, true);
}
@@ -1168,7 +1155,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* the old content is no longer relevant. (The usage_count starts out at
* 1 so that the buffer can survive one clock-sweep pass.)
*/
- buf->tag = newTag;
+ buf->tag = newEnt.key;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
@@ -1178,14 +1165,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnlockBufHdr(buf);
- if (oldFlags & BM_TAG_VALID)
- {
- BufTableDelete(&oldTag, oldHash);
- if (oldPartitionLock != newPartitionLock)
- LWLockRelease(oldPartitionLock);
- }
-
- LWLockRelease(newPartitionLock);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
@@ -1220,42 +1202,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
static void
InvalidateBuffer(volatile BufferDesc *buf)
{
- BufferTag oldTag;
- uint32 oldHash; /* hash value for oldTag */
- LWLock *oldPartitionLock; /* buffer partition lock for it */
+ BufferLookupEnt oldEnt;
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
- oldTag = buf->tag;
-
- UnlockBufHdr(buf);
-
- /*
- * Need to compute the old tag's hashcode and partition lock ID. XXX is it
- * worth storing the hashcode in BufferDesc so we need not recompute it
- * here? Probably not.
- */
- oldHash = BufTableHashCode(&oldTag);
- oldPartitionLock = BufMappingPartitionLock(oldHash);
-
-retry:
-
- /*
- * Acquire exclusive mapping lock in preparation for changing the buffer's
- * association.
- */
- LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
-
- /* Re-lock the buffer header */
- LockBufHdr(buf);
-
- /* If it's changed while we were waiting for lock, do nothing */
- if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
- {
- UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
- return;
- }
+ oldEnt.key = buf->tag;
/*
* We assume the only reason for it to be pinned is that someone else is
@@ -1266,15 +1217,21 @@ retry:
* yet done StartBufferIO, WaitIO will fall through and we'll effectively
* be busy-looping here.)
*/
- if (buf->refcount != 0)
+ while (buf->refcount != 0)
{
UnlockBufHdr(buf);
- LWLockRelease(oldPartitionLock);
/* safety check: should definitely not be our *own* pin */
if (GetPrivateRefCount(buf->buf_id) > 0)
elog(ERROR, "buffer is pinned in InvalidateBuffer");
WaitIO(buf);
- goto retry;
+ LockBufHdr(buf);
+
+ /* If it's changed while we were waiting for lock, do nothing */
+ if (!BUFFERTAGS_EQUAL(buf->tag, oldEnt.key))
+ {
+ UnlockBufHdr(buf);
+ return;
+ }
}
/*
@@ -1291,13 +1248,9 @@ retry:
/*
* Remove the buffer from the lookup hashtable, if it was in there.
*/
- if (oldFlags & BM_TAG_VALID)
- BufTableDelete(&oldTag, oldHash);
-
- /*
- * Done with mapping lock.
- */
- LWLockRelease(oldPartitionLock);
+ if ((oldFlags & BM_TAG_VALID) != 0 &&
+ !CHashDelete(SharedBufHash, &oldEnt))
+ elog(ERROR, "shared buffer hash table corrupted");
/*
* Insert the buffer at the head of the list of free buffers.
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 3add619..2410dfc 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -432,7 +432,7 @@ StrategyShmemSize(void)
Size size = 0;
/* size of lookup hash table ... see comment in StrategyInitialize */
- size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+ size = add_size(size, BufMgrShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
/* size of the shared replacement strategy control block */
size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
@@ -462,7 +462,7 @@ StrategyInitialize(bool init)
* happening in each partition concurrently, so we could need as many as
* NBuffers + NUM_BUFFER_PARTITIONS entries.
*/
- InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+ BufMgrInitShmem(NBuffers + NUM_BUFFER_PARTITIONS);
/*
* Get or create the shared strategy control block
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 250e312..5f94f57 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
return structPtr;
}
+/*
+ * ShmemAttachStruct -- Attach to an existing structure in shared memory.
+ */
+void *
+ShmemAttachStruct(const char *name)
+{
+ ShmemIndexEnt *result;
+ void *ptr;
+ bool found;
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_FIND, &found);
+ if (!found || result == NULL)
+ elog(ERROR, "shared memory structure %s not found", name);
+ ptr = result->location;
+ Assert(ptr != NULL);
+
+ LWLockRelease(ShmemIndexLock);
+
+ return ptr;
+}
/*
* Add two Size values, checking for overflow
diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile
index 05d347c..5d53382 100644
--- a/src/backend/utils/hash/Makefile
+++ b/src/backend/utils/hash/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/hash
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = dynahash.o hashfn.o
+OBJS = chash.o dynahash.o hashfn.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c
new file mode 100644
index 0000000..0d4dc78
--- /dev/null
+++ b/src/backend/utils/hash/chash.c
@@ -0,0 +1,1075 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.c
+ * concurrent hash tables
+ *
+ * A concurrent hash table stores a collection of fixed-size objects.
+ * From the point of view of this module, such objects are merely an
+ * opaque array of bytes, but the caller will typically implement them as
+ * a C "struct". Some fixed-size, leading portion of each object is
+ * designated as the key, which must be distinct for all objects in the
+ * collection. Since PostgreSQL's shared memory model does not permit
+ * dynamic shared-memory allocation, we preallocate shared-memory space
+ * for the maximum number of entities which can be stored (plus a few
+ * extra, for reasons that will be further explained below). This space
+ * is allocated as a single large array called the arena, and we often
+ * refer to entities by their position in the arena rather than via an
+ * ordinary pointer. This saves a considerable amount of memory, since
+ * most modern architectures are 64-bit and therefore use 8-byte pointers,
+ * while arena offsets can be stored in a 32-bit word. In fact, we
+ * reserve one bit in each such word as a mark bit, so the maximum size
+ * of the arena is 2^31 elements, a restriction that does not currently
+ * appear to be problematic. An additional advantage of this representation
+ * is that aligned 32-bit loads and stores are atomic on all architectures
+ * we support, but 64-bit loads and stores are not.
+ *
+ * When an element is inserted, we copy the data from the backend-private
+ * object supplied by the caller into one of these shared-memory entities.
+ * When the hash table is searched, the caller passes a backend-private
+ * entity with just the key filled in; if a matching element is found,
+ * data is copied from the shared memory entity into the non-key portion
+ * of the user-supplied entity. In this way, clients of this module
+ * never use pointers into shared memory directly.
+ *
+ * As normal, we structure the hash table as an array of buckets, whose
+ * size is always a power of two, so that the low-order bits of the
+ * hash code can be used to select a bucket. If multiple entities hash
+ * to the same bucket, we use separate chaining: each entity in the
+ * arena has an 8-byte header that stores the 4-byte arena offset of the
+ * next item in the bucket and the hash value of the entity's key.
+ * Bucket chains are maintained in order by ascending hash value and
+ * then by ascending entity key (as per memcmp) so that there is
+ * precisely one legal location at which a given new item can be inserted
+ * into a bucket.
+ *
+ * Items are inserted into buckets using compare-and-swap (CAS). Thus, this
+ * module cannot be used on architectures where we do not have 4-byte
+ * compare-and-swap. When an item is deleted, we first set its mark bit,
+ * which is stored within the next-pointer, again using CAS. Once this
+ * step is completed, the node is considered deleted. As a cleanup, we then
+ * use CAS to modify the next-pointer of the previous node to point to the
+ * node following the deleted node. Note that, even once this cleanup
+ * operation has been performed, some other backend concurrently walking the
+ * chain might still be visiting the deleted node. Thus, we must be certain
+ * not to overwrite the deleted node's next-pointer until all concurrent
+ * bucket scans have completed. This means, in particular, that we cannot
+ * immediately view the deleted node as available for reuse.
+ *
+ * Instead, when a delete-marked node is removed from the bucket chain, it
+ * is added to one of many garbage lists. There is a many-to-one mapping from
+ * buckets to garbage lists, so that the choice of bucket determines the
+ * garbage list but not vice versa. Any process which wishes to scan a bucket
+ * must first advertise in shared memory the corresponding garbage list number.
+ * When a backend wishes to move entries from a garbage list to a free list,
+ * it must first wait (by spinning) for any backends scanning that garbage
+ * list to complete their scans.
+ *
+ * Ideally, it would be nice to make this completely lock-free, but because
+ * of the above-described choice of garbage collection algorithm, it currently
+ * isn't. For an algorithm to be lock-free, it must be possible to suspend
+ * the execution of any number of processes for an arbitrary period of time
+ * without impeding the overall progress of the system. For this code, that
+ * is true except when garbage collection occurs. In that case, an insert,
+ * search, or delete operation can obstruct an inserting thread attempting to
+ * perform garbage collection for an unbounded period of time. The algorithm
+ * could be adapted to be completely lock-free, essentially by guaranteeing
+ * that even in the worst case no combination of processes can obstruct garbage
+ * collection to a sufficient degree as to prevent an inserter from finding
+ * an available entry in a hash table containing fewer live elements than its
+ * declared maximum capacity. However, it's not clear that this is a good
+ * idea, because it would likely slow down operation in the non-contended
+ * case to solve a problem that we hope won't happen anyway.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/hash/chash.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/hash.h"
+#include "storage/barrier.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "utils/chash.h"
+#include "utils/memutils.h"
+
+/*
+ * CHashPtr represents an offset into the arena, plus a mark bit that is
+ * used to implement concurrent deletion.
+ */
+typedef uint32 CHashPtr;
+#define InvalidCHashPtr ((uint32) -2)
+#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr)
+#define CHashPtrIsMarked(x) ((x) & 1)
+#define CHashPtrGetOffset(x) ((x) >> 1)
+#define CHashPtrMark(x) ((x) | 1)
+#define CHashPtrUnmark(x) ((x) & ~1)
+#define MakeCHashPtr(x) ((x) << 1)
+#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr)
+
+/*
+ * Each object stored in the hash table is represented by a CHashNode, which
+ * stores a pointer to the next item in the same bucket, and the exact hash
+ * value of the current item. Each CHashNode is followed by space for the
+ * item itself.
+ */
+typedef struct
+{
+ CHashPtr next; /* arena offset of next element */
+ union
+ {
+ uint32 hashcode; /* hash(key) */
+ CHashPtr gcnext; /* arena offset of next garbage item */
+ } un;
+} CHashNode;
+
+#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode))
+#define CHashNodeGetItem(x) (((char *) x) + SizeOfCHashNode)
+
+/*
+ * CHashTableData stores all the information that we need in order to access
+ * a concurrent hash table. We store one copy of this data in shared memory,
+ * and an additional copy in the private memory of each backend accessing the
+ * table.
+ */
+typedef struct CHashTableData
+{
+ /*
+ * These fields do not change after initialization.
+ */
+ CHashDescriptor desc; /* descriptor for this hash table */
+ uint32 bucket_mask; /* # of buckets, minus one */
+ uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */
+ uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */
+ uint16 nfreelists; /* # of freelists */
+ uint32 arena_limit; /* # of arena elements */
+ uint32 arena_stride; /* bytes allocated per arena element */
+ CHashPtr *bucket; /* array with 1 entry per bucket */
+ CHashPtr *extra; /* entries for garbage and free lists */
+ char *arena; /* arena */
+
+ /*
+ * These fields will be different in each backend; the shared copy is
+ * irrelevant.
+ */
+ int gc_pid; /* PID that set gc_next */
+ uint32 gc_next; /* next garbage list to reclaim */
+ uint64 stats[CHS_NumberOfStatistics]; /* statistics */
+} CHashTableData;
+
+/* Compute # of buckets, garbage lists, or free lists. */
+#define CHashTableNBuckets(table) \
+ ((table)->bucket_mask + 1)
+#define CHashTableNGarbage(table) \
+ (CHashTableNBuckets((table)) >> (table)->garbage_shift)
+#define CHashTableNFreeLists(table) \
+ ((table)->nfreelists)
+
+/*
+ * Garbage lists and free lists are interleaved to reduce cache line
+ * contention on the free lists, so the calculation of where an individual
+ * list is located is a bit complex.
+ */
+#define CHashTableGetGarbageList(table, n) \
+ (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
+#define CHashTableGetGarbageByBucket(table, n) \
+ (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
+#define CHashTableGetFreeList(table, n) \
+ (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
+
+/* Access macros for arena nodes. */
+#define CHashTableGetRaw(table, offset) \
+ (AssertMacro((offset) < (table)->arena_limit), \
+ (CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
+#define CHashTableGetNode(table, ptr) \
+ (AssertMacro(!CHashPtrIsInvalid(ptr)), \
+ CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
+
+/* Statistics macros. */
+#define CHashTableIncrementStatistic(table, stat) \
+ ((table)->stats[(stat)]++)
+
+/*
+ * Bucket scan.
+ */
+typedef struct
+{
+ CHashPtr target;
+ CHashPtr next;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node;
+ bool found;
+} CHashScanResult;
+
+/* Human-readable statistics names. */
+char *CHashStatisticsNames[] = {
+ "searches",
+ "searches failed",
+ "inserts",
+ "inserts failed",
+ "inserts retried",
+ "deletions",
+ "deletions failed",
+ "deletions retried",
+ "scan expunges",
+ "scan expunges failed",
+ "scans restarted",
+ "cleanup scans",
+ "allocations failed",
+ "garbage enqueues retried",
+ "garbage dequeues failed",
+ "garbage collections",
+ "garbage collection spins",
+ "garbage collection reclaims skipped",
+ "garbage collection fast reclaims",
+ "garbage collection reclaims retried",
+ "<end>"
+};
+
+/* Function prototypes. */
+static CHashPtr CHashAllocate(CHashTable table);
+static CHashPtr CHashAllocateViaGC(CHashTable table);
+static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c);
+static void CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res);
+
+/*
+ * First stage of CHashTable initialization. We fill in all the constants
+ * here, but not the pointers.
+ */
+CHashTable
+CHashBootstrap(CHashDescriptor *desc)
+{
+ CHashTable table;
+ uint32 bucket_shift;
+
+ /* Sanity check. */
+ Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>"));
+
+ /* Allocate table and copy descriptor. */
+ table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData));
+ memcpy(&table->desc, desc, sizeof(CHashDescriptor));
+
+ /* Sanity checks. */
+ if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity)
+ elog(ERROR, "invalid capacity for concurrent hash");
+ if (desc->key_size < 1 || desc->key_size > desc->element_size)
+ elog(ERROR, "invalid key size for concurrent hash");
+
+ /*
+ * The number of buckets must be a power of two. To avoid (as much as
+ * possible) having to traverse long bucket chains, we aim for a load
+ * factor <= 1.0, so this is a pretty simple calculation: we just find the
+ * smallest power of two greater than or equal to the target capacity.
+ */
+ bucket_shift = fls(desc->capacity) - 1;
+ table->bucket_mask = (1 << bucket_shift) - 1;
+
+ /*
+ * We choose to have one garbage list for every 64 hash table buckets
+ * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
+ * total, in which case we still have a minimum of one garbage list.
+ *
+ * Increasing the garbage_shift would reduce the likelihood of a backend
+ * performing garbage collection needing to wait for a backend walking a
+ * bucket to finish its scan. On the other hand, decreasing the garbage
+ * shift would allow more items to be recovered in a single garbage
+ * collection cycle. It's not clear what the optimal value is.
+ */
+ table->garbage_shift = Min(bucket_shift, 6);
+ table->gc_next = 0;
+ table->gc_pid = 0;
+
+ /*
+ * Experimentation reveals that the free list manipulation is both one of
+ * the slowest parts of this algorithm and the most vulnerable to
+ * contention. Therefore, we want to have as many free lists as possible,
+ * but there's no need to have more than the number of CPU cores, so we
+ * limit the number of freelists to 64. There might be a benefit in some
+ * larger limit on a really big system, but we'll cap it here pending some
+ * actual test results. We're also limited to having no more freelists
+ * than we do garbage lists.
+ */
+#define LOG2_MAX_FREELIST 6
+ if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
+ table->freelist_shift = 0;
+ else
+ table->freelist_shift =
+ (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
+ table->nfreelists =
+ 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
+
+ /*
+ * To make garbage collection efficient, we overallocate. Normally, we
+ * overallocate by one-eighth, but if that would be less than 15 elements,
+ * then we allocate 15 elements instead. This extra capacity can actually
+ * be used, but for best performance, it shouldn't be. It's the caller's
+ * responsibility to avoid this.
+ */
+ table->arena_limit = desc->capacity;
+ if (desc->capacity < 120)
+ table->arena_limit += 15;
+ else
+ table->arena_limit += table->arena_limit / 8;
+
+ /* Each arena element must be MAXALIGN'd and include per-node space. */
+ table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size);
+
+ /* Zero out statistics. */
+ memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics);
+
+ return table;
+}
+
+/*
+ * Estimate shared memory requirements.
+ */
+Size
+CHashEstimateSize(CHashTable table)
+{
+ Size total_buckets;
+ Size size;
+ Size nbuckets = CHashTableNBuckets(table);
+ Size ngarbage = CHashTableNGarbage(table);
+ Size nfreelists = CHashTableNFreeLists(table);
+
+ Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
+ total_buckets = add_size(nbuckets, ngarbage);
+ total_buckets = add_size(total_buckets, nfreelists);
+
+ size = MAXALIGN(sizeof(CHashTableData));
+ size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
+ size = add_size(size, mul_size(table->arena_stride, table->arena_limit));
+
+ return size;
+}
+
+/*
+ * Create a concurrent hash table in shared memory, or attach to an existing
+ * table.
+ */
+CHashTable
+CHashInitialize(CHashTable table, CHashDescriptor *desc)
+{
+ Size size;
+ bool found;
+ void *shmem;
+ uint32 i;
+ uint32 nbuckets;
+ uint32 nfreelists;
+ uint32 ngarbage;
+ uint32 nextra;
+
+ /*
+ * If we're under the postmaster, this must be the EXEC_BACKEND case where
+ * we need to attach to an existing shared-memory segment.
+ */
+ if (IsUnderPostmaster)
+ {
+ void *shmem;
+
+ Assert(table == NULL);
+ table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData));
+ shmem = ShmemAttachStruct(desc->shmem_name);
+ memcpy(table, shmem, sizeof(CHashTableData));
+ return table;
+ }
+
+ /*
+ * Otherwise, the hash table should not already exist, and we must
+ * create it. But the table should already be bootstrapped, since we
+ * must previously have computed its size when figuring out our shared
+ * memory allocation.
+ */
+ Assert(table != NULL);
+ size = CHashEstimateSize(table);
+ shmem = ShmemInitStruct(table->desc.shmem_name, size, &found);
+ Assert(!found);
+
+ /* Bucket, garbage, and freelist arrays follow table info. */
+ table->bucket = (CHashPtr *)
+ (((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
+ nbuckets = CHashTableNBuckets(table);
+ table->extra = &table->bucket[nbuckets];
+
+ /* Arena follows the various lists. */
+ ngarbage = CHashTableNGarbage(table);
+ nfreelists = CHashTableNFreeLists(table);
+ nextra = ngarbage + nfreelists;
+ table->arena = (void *) (&table->extra[nextra]);
+
+ /* Initialize all three sets of lists to empty. */
+ for (i = 0; i < nbuckets; ++i)
+ table->bucket[i] = InvalidCHashPtr;
+ for (i = 0; i < nextra; ++i)
+ table->extra[i] = InvalidCHashPtr;
+
+ /* Put all arena elements on the free lists. */
+ for (i = 0; i < table->arena_limit; ++i)
+ {
+ CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists);
+ CHashNode *n = CHashTableGetRaw(table, i);
+
+ n->un.gcnext = *f;
+ *f = MakeCHashPtr(i);
+ }
+
+ /*
+ * Copy table (with pointers now filled in) to shared memory. This is
+ * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway.
+ */
+ memcpy(shmem, table, sizeof(CHashTableData));
+
+ return table;
+}
+
+/*
+ * Search a concurrent hash table. entry should be a block of memory large
+ * enough to hold a complete entry, with just the key portion filled in. If
+ * a matching entry is found, this function will fill in the rest of the entry
+ * from the data in the hash table and return true. If not, it will return
+ * false.
+ */
+bool
+CHashSearch(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket and return data from any matching entry. */
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ CHashTableIncrementStatistic(table, CHS_Search);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Search_Failed);
+ return scan.found;
+}
+
+/*
+ * Insert into a concurrent hash table. entry should be the complete entry
+ * to be inserted. If no duplicate entry is found in the table, this function
+ * will insert the entry and return true. Otherwise, the duplicate entry's
+ * value will be copied into the supplied entry and the function will return
+ * false.
+ *
+ * The caller is responsible for ensuring that no inserts are performed into
+ * a hash table which is at capacity. The behavior of such an insert is
+ * undefined (the actual behavior is that the insert may either succeed,
+ * degrading performance; or CHashAllocate may enter a tight loop until such
+ * time as an element is deleted).
+ */
+bool
+CHashInsert(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashPtr new;
+ CHashNode *nnew;
+ CHashScanResult scan;
+
+ /*
+ * Allocate and initialize a new entry, on the assumption that the insert
+ * will succeed. If it ends up failing, we must be sure to put this back
+ * on some free list, lest it be permanently leaked.
+ */
+ new = CHashAllocate(table);
+ nnew = CHashTableGetNode(table, new);
+ nnew->un.hashcode = hashcode;
+ memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
+
+ /* Prevent garbage collection for this bucket. */
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /*
+ * Scan the bucket. If we don't find a match, use compare-and-swap to
+ * insert the new node at the insert position. If we do find a match,
+ * return the data to the caller.
+ */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+ if (scan.found)
+ memcpy(((char *) entry) + table->desc.key_size,
+ CHashNodeGetItem(scan.target_node) + table->desc.key_size,
+ table->desc.element_size - table->desc.key_size);
+ else
+ {
+ /*
+ * We didn't find a matching element, so use compare-and-swap to
+ * attempt to insert the new element we've prepared. If this fails,
+ * someone concurrently inserted or deleted an element. The correct
+ * insertion point might have changed, or the key we're trying to
+ * insert might now be present when it wasn't before, so we'll have
+ * to search the bucket chain anew.
+ *
+ * There is a risk of starvation here, but we hope it will not happen
+ * in practice. Contention for inserting new elements should be
+ * spread out pretty much evenly across N+M possible insertion points,
+ * where N is the number of buckets and M is the number of elements
+ * in the table. Even for a quite modestly sized table this is likely
+ * to exceed the number of CPU cores.
+ */
+ Assert(!CHashPtrIsMarked(scan.target));
+ nnew->next = scan.target;
+ if (!__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target, new))
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Retry);
+ goto retry;
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /*
+ * If the insert failed, add the entry we allocated above to the
+ * appropriate garbage list. We can't simply put it back on the freelist,
+ * because that leads to an ABA problem: some other backend could
+ * read the head of the freelist, go into the tank, and then use
+ * CAS to pop an element. If at that point, it finds the same
+ * element at the head of the freelist but with a different successor,
+ * we're hosed. To prevent that, we ensure that elements are added
+ * to a given freelist only after verifying that any allocations in
+ * progress at the time we popped the freelist have completed. This
+ * guarantees that any allocation still in progress at the time this
+ * element makes it back to the freelist is trying to allocate some
+ * other node.
+ */
+ CHashTableIncrementStatistic(table, CHS_Insert);
+ if (scan.found)
+ {
+ CHashTableIncrementStatistic(table, CHS_Insert_Failed);
+ CHashAddToGarbage(table, bucket, new);
+ }
+
+ /* The insert succeeded if and only if no duplicate was found. */
+ return !scan.found;
+}
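
A hypothetical insert, reusing the BufMapEnt sketch above: the caller fills
in the whole entry, and on a duplicate key the existing payload is copied
back into the caller's entry and false is returned.

    static int
    insert_buffer(CHashTable buf_map, const BufferTag *tag, int buf_id)
    {
        BufMapEnt   ent;

        ent.key = *tag;
        ent.buf_id = buf_id;
        if (CHashInsert(buf_map, &ent))
            return buf_id;      /* our mapping went in */
        return ent.buf_id;      /* a mapping for this tag already existed */
    }
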
+
+/*
+ * Delete from a concurrent hash table. entry need only contain the key field.
+ * Returns true if we find and delete a matching key and false otherwise.
+ */
+bool
+CHashDelete(CHashTable table, void *entry)
+{
+ uint32 hashcode = hash_any(entry, table->desc.key_size);
+ uint32 bucket = hashcode & table->bucket_mask;
+ CHashPtr *b = &table->bucket[bucket];
+ CHashScanResult scan;
+
+ /* Prevent garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] == NULL);
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
+ pg_memory_barrier();
+
+ /* Scan bucket. */
+retry:
+ CHashBucketScan(table, b, hashcode, entry, &scan);
+
+ /* If we found it, try to delete it. */
+ if (scan.found)
+ {
+ Assert(!CHashPtrIsMarked(scan.next));
+
+ /* Attempt to apply delete-mark. */
+ if (!__sync_bool_compare_and_swap(&scan.target_node->next,
+ scan.next,
+ CHashPtrMark(scan.next)))
+ {
+ CHashTableIncrementStatistic(table, CHS_Delete_Retry);
+ goto retry;
+ }
+
+ /* Deletion is done; attempt to remove node from list. */
+ if (__sync_bool_compare_and_swap(scan.pointer_to_target,
+ scan.target,
+ scan.next))
+ CHashAddToGarbage(table, bucket, scan.target);
+ else
+ {
+ CHashScanResult cleanup_scan;
+
+ /*
+ * If we weren't able to remove the deleted item, rescan
+ * the bucket to make sure it's really gone. This is just
+ * like a regular bucket scan, except that we don't care
+ * about the results. We're just doing it to achieve the
+ * side-effect of removing delete-marked nodes from the
+ * bucket chain.
+ */
+ CHashTableIncrementStatistic(table, CHS_Cleanup_Scan);
+ CHashBucketScan(table, b, hashcode, entry, &cleanup_scan);
+ }
+ }
+
+ /* Allow garbage collection for this bucket. */
+ Assert(MyProc->hazard[0] != NULL);
+ pg_memory_barrier();
+ MyProc->hazard[0] = NULL;
+
+ /* We're done. */
+ CHashTableIncrementStatistic(table, CHS_Delete);
+ if (!scan.found)
+ CHashTableIncrementStatistic(table, CHS_Delete_Failed);
+ return scan.found;
+}
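
A matching delete sketch (again using the hypothetical BufMapEnt): only the
key portion of the entry needs to be filled in.

    static void
    delete_buffer(CHashTable buf_map, const BufferTag *tag)
    {
        BufMapEnt   ent;

        ent.key = *tag;
        if (!CHashDelete(buf_map, &ent))
            elog(LOG, "buffer mapping already gone");
    }
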
+
+/*
+ * Provide backend-local statistics to caller.
+ */
+void
+CHashStatistics(CHashTable table, uint64 *stats)
+{
+ memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics);
+}
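
A caller could dump these backend-local counters like so; this is only a
sketch, assuming CHashStatisticsNames (declared in chash.h) is indexed by
CHashStatisticsType:

    static void
    report_chash_stats(CHashTable table)
    {
        uint64      stats[CHS_NumberOfStatistics];
        int         i;

        CHashStatistics(table, stats);
        for (i = 0; i < CHS_NumberOfStatistics; ++i)
            elog(LOG, "%s: " UINT64_FORMAT,
                 CHashStatisticsNames[i], stats[i]);
    }
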
+
+/*
+ * Scan one bucket of a concurrent hash table, storing the results in a
+ * CHashScanResult object provided by the caller.
+ *
+ * Caller must suppress garbage collection for the target bucket before
+ * calling this function, and resume it afterwards.
+ *
+ * On return, res->found will be true if a matching item was found and false
+ * otherwise. res->target will be a CHashPtr referencing the matching item,
+ * or the first one following the position where the matching item should have
+ * been; res->pointer_to_target will be a pointer to the memory address from
+ * which res->target was read.
+ *
+ * If res->target is not invalid, then res->target_node is a pointer to its
+ * location in memory. If res->found, then res->next will be the next pointer
+ * of res->target_node; otherwise, it's undefined.
+ */
+static void
+CHashBucketScan(CHashTable table,
+ CHashPtr *start,
+ uint32 hashcode,
+ const void *key,
+ CHashScanResult *res)
+{
+ CHashPtr target;
+ CHashPtr *pointer_to_target;
+ CHashNode *target_node = NULL;
+
+retry:
+ pointer_to_target = start;
+ target = *pointer_to_target;
+ for (;;)
+ {
+ CHashPtr next;
+ uint32 h;
+ int cmp;
+
+ /*
+ * If we've reached the end of the bucket chain, stop; otherwise,
+ * figure out the actual address of the next item.
+ */
+ if (CHashPtrIsInvalid(target))
+ {
+ res->found = false;
+ break;
+ }
+ target_node = CHashTableGetNode(table, target);
+
+ /*
+ * target may have been fetched from an arena entry that could be
+ * concurrently modified, so a dependency barrier is required before
+ * dereferencing the derived pointer.
+ */
+ pg_read_barrier_depends();
+ next = target_node->next;
+
+ /*
+ * For simplicity, any bucket scan, even if it's servicing a search,
+ * will attempt to remove delete-marked items from the bucket. This
+ * ensures that delete-marked elements are removed from bucket chains
+ * as quickly as possible and reduces code duplication. See
+ * CHashDelete for further comments about why delete-marking is
+ * necessary and how it allows safe deletion.
+ */
+ if (CHashPtrIsMarked(next))
+ {
+zap:
+ if (__sync_bool_compare_and_swap(pointer_to_target,
+ target,
+ CHashPtrUnmark(next)))
+ {
+ /*
+ * No one else can possibly have modified target_node->next,
+ * because such modifications are not allowed after a
+ * delete-mark has been applied. Thus, if we just keep
+ * following the next pointers, we're guaranteed to visit
+ * all non-deleted items (and possibly some deleted items)
+ * that were present at the time we began the scan.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge);
+ CHashAddToGarbage(table, hashcode & table->bucket_mask,
+ target);
+ target = CHashPtrUnmark(next);
+ }
+ else
+ {
+ /*
+ * If the previous node has been delete-marked, we can't
+ * further update the next-pointer, so restart the scan.
+ * Otherwise, we know that some other backend removed one
+ * or more deleted nodes from the bucket chain just as we
+ * were trying to do, and we can simply continue the scan
+ * from wherever the previous node is pointing now. It's
+ * possible that some concurrent inserts have happened, too,
+ * but that's OK; we can view those as having happened "before"
+ * whatever operation this scan is supporting.
+ *
+ * Although starvation is a theoretical possibility here if
+ * we are forced to retry repeatedly, even a single retry is
+ * vanishingly unlikely in practice. It requires some other
+ * backend to delete both the node that we're looking at and
+ * the node which precedes it before we advance to the next
+ * node. That could certainly happen occasionally, but we'd
+ * have to be pretty unlucky to have it happen even twice in
+ * a row.
+ */
+ CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail);
+ target = *pointer_to_target;
+ if (CHashPtrIsMarked(target))
+ {
+ CHashTableIncrementStatistic(table, CHS_Scan_Restart);
+ goto retry;
+ }
+ }
+ continue;
+ }
+
+ /*
+ * Bucket chains are kept in order, so that there is exactly one legal
+ * point at which any given key can be inserted. The ordering is by
+ * hashcode first, and then by memcmp ordering of the keys involved.
+ */
+ h = target_node->un.hashcode;
+ if (h == hashcode)
+ cmp = memcmp(CHashNodeGetItem(target_node), key,
+ table->desc.key_size);
+ else if (h > hashcode)
+ cmp = 1;
+ else
+ cmp = -1;
+
+ /*
+ * If cmp < 0, then we haven't yet reached the point at which we'd
+ * expect to find the key, so we must continue the scan. If cmp == 0,
+ * we've found the key and can stop. If cmp > 0, we've either passed
+ * the point where we expect to find the key OR someone delete-marked
+ * the item and overwrote the hashcode with a gcnext pointer. In the
+ * latter case we must take care not to be fooled into stopping the
+ * scan early.
+ */
+ if (cmp >= 0)
+ {
+ if (cmp == 0)
+ {
+ res->found = true;
+ res->next = next;
+ break;
+ }
+ else
+ {
+ /*
+ * pg_read_barrier() prevents the reread of the next pointer
+ * from being speculated ahead of the read of the hash value.
+ */
+ pg_read_barrier();
+ next = target_node->next;
+ if (CHashPtrIsMarked(next))
+ goto zap;
+ res->found = false;
+ break;
+ }
+ }
+
+ /* Continue scan from next node. */
+ pointer_to_target = &target_node->next;
+ target = next;
+ }
+
+ /* Send results back to caller. */
+ res->target = target;
+ res->pointer_to_target = pointer_to_target;
+ res->target_node = target_node;
+}
+
+/*
+ * Allocate an arena slot for a new item to be inserted into a hash table.
+ *
+ * We don't want to wait until every single free-list is completely empty
+ * before beginning to garbage collect, because that could result in very
+ * fast allocation followed by a storm of garbage collection activity.
+ * It could also lead to every inserting backend ganging up on the only
+ * non-empty freelist.
+ *
+ * To avoid that, we check free lists and garbage lists in alternation.
+ * We always start with the same free list - which one is based on our
+ * backend ID - but we try to round-robin over all the available garbage
+ * lists. Whenever we successfully garbage collect, we put the recovered
+ * items on our own free list. In this way, if there's only one backend
+ * active, it will typically find a free buffer in the first place it looks:
+ * its own free list. It will also settle into a pattern of garbage
+ * collecting the garbage list which it has visited least recently, which
+ * is what we want.
+ */
+static CHashPtr
+CHashAllocate(CHashTable table)
+{
+ uint32 f_current;
+ CHashPtr new;
+
+ /* Pick a starting freelist based on our backend ID. */
+ f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+
+ /* If this process hasn't initialized gc_next yet, do that now. */
+ if (table->gc_pid != MyProcPid)
+ {
+ table->gc_pid = MyProcPid;
+ table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
+ }
+
+ /* Loop until we allocate a buffer. */
+ for (;;)
+ {
+ CHashPtr *b;
+
+ /*
+ * Try to pop a buffer from a freelist using compare-and-swap.
+ *
+ * Note that this is only safe if it's impossible for the next pointer
+ * of the first element on the list to change between the time when
+ * we read it and the time we use CAS to pop it off the list. This
+ * means that we can't allow any element that is currently on this
+ * freelist to be allocated, marked as garbage, garbage collected,
+ * and returned back to this freelist before we finish the CAS.
+ *
+ * If we attempt to pop the free-list and fail, we retry immediately
+ * with the same free-list. This reduces the frequency with which
+ * we're obliged to update our hazard pointers, which is a material
+ * savings due to the associated memory barrier.
+ */
+ b = CHashTableGetFreeList(table, f_current);
+ MyProc->hazard[0] = b;
+ pg_memory_barrier();
+ new = *b;
+ while (!CHashPtrIsInvalid(new))
+ {
+ CHashNode *n = CHashTableGetNode(table, new);
+
+ /*
+ * n is computed from table->freelist[f_current], which could
+ * be modified by concurrent activity, so we need a dependency
+ * barrier here.
+ */
+ pg_read_barrier_depends();
+ if (__sync_bool_compare_and_swap(b, new, n->un.gcnext))
+ return new;
+ CHashTableIncrementStatistic(table, CHS_Allocate_Fail);
+ new = *b;
+ }
+
+ /* Attempt garbage collection. */
+ new = CHashAllocateViaGC(table);
+ if (!CHashPtrIsInvalid(new))
+ return new;
+
+ /* Advance to next freelist. */
+ f_current = (f_current + 1) % CHashTableNFreeLists(table);
+ }
+}
+
+/*
+ * Attempt to satisfy an allocation request via garbage collection.
+ */
+static CHashPtr
+CHashAllocateViaGC(CHashTable table)
+{
+ uint32 f_home;
+ CHashPtr *b;
+ CHashPtr *fh;
+ CHashPtr fhead;
+ CHashPtr garbage;
+ CHashPtr new;
+ CHashNode *n;
+ uint32 i;
+
+ /* Pick a target freelist based on our backend ID. */
+ f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+ fh = CHashTableGetFreeList(table, f_home);
+
+ /* Select target garbage list. */
+ table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
+ b = CHashTableGetGarbageList(table, table->gc_next);
+ garbage = *b;
+
+ /* If list is empty, fail. */
+ if (CHashPtrIsInvalid(garbage))
+ return InvalidCHashPtr;
+
+ /* If we're unable to empty the list via compare-and-swap, fail. */
+ if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr))
+ {
+ CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail);
+ return InvalidCHashPtr;
+ }
+
+ /*
+ * Before moving elements removed from the garbage list to the
+ * freelist, we must wait until (1) all bucket scans that might
+ * still see elements on the freelist as part of the bucket chain
+ * have completed and (2) all allocations that might see an old
+ * version of the freelist containing one of the elements to be
+ * garbage collected have completed.
+ *
+ * Note: We can't begin this operation until the clearing of the
+ * garbage list has been committed to memory, but since that was
+ * done using an atomic operation no explicit barrier is needed
+ * here.
+ *
+ * Note: We could have a "soft" version of this that merely
+ * requeues the garbage if it's not immediately recyclable, but
+ * it's not clear that we need such a thing. On the flip side we
+ * might want to eventually enter a longer sleep here, or PANIC,
+ * but it's not clear exactly how to calibrate that.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC);
+ MyProc->hazard[0] = NULL;
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ volatile PGPROC *proc = &ProcGlobal->allProcs[i];
+ void *hazard;
+
+ hazard = proc->hazard[0];
+ if (hazard == b || hazard == fh)
+ {
+ CHashTableIncrementStatistic(table, CHS_GC_Spin);
+ do
+ {
+ hazard = proc->hazard[0];
+ } while (hazard == b || hazard == fh);
+ }
+ }
+
+ /* Remove one item from list to satisfy current allocation. */
+ new = garbage;
+ n = CHashTableGetNode(table, new);
+ pg_read_barrier_depends();
+ fhead = n->un.gcnext;
+
+ if (CHashPtrIsInvalid(fhead))
+ {
+ /*
+ * We have reclaimed exactly one node, so there's nothing to put
+ * back on the free list. In this case (only) we need a
+ * memory barrier, because the reads above must complete
+ * before we overwrite n->un.gcnext with a new hashcode.
+ * (This is only needed when we reclaim exactly one node,
+ * because in any other case we'll do a compare-and-swap
+ * before returning, which implies a full barrier.)
+ */
+ pg_memory_barrier();
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped);
+ }
+ else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead))
+ {
+ /*
+ * Our free list is empty, and we've successfully pushed the
+ * reclaimed nodes onto it. So we're done.
+ */
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast);
+ }
+ else
+ {
+ CHashPtr fcurrent;
+ CHashPtr fnext;
+ CHashPtr oldhead;
+
+ /* Walk list of reclaimed elements to end. */
+ fcurrent = fhead;
+ for (;;)
+ {
+ n = CHashTableGetNode(table, fcurrent);
+ fnext = n->un.gcnext;
+ if (CHashPtrIsInvalid(fnext))
+ break;
+ fcurrent = fnext;
+ }
+
+ /* Push reclaimed elements onto home free list. */
+ for (;;)
+ {
+ oldhead = *fh;
+ n->un.gcnext = oldhead;
+ if (__sync_bool_compare_and_swap(fh, oldhead, fhead))
+ break;
+ CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry);
+ }
+ }
+
+ /* Return the element we saved for ourselves. */
+ return new;
+}
+
+/*
+ * Add an arena slot to the appropriate garbage list.
+ *
+ * The next garbage collection cycle for the affected bucket will move it
+ * to the free list. We can't do that immediately because there might be
+ * someone traversing the list who is counting on being able to follow the
+ * next pointer. It's OK to clobber the hash value, though, since a spurious
+ * failure to match an already-deleted item shouldn't cause any problems;
+ * this is why gcnext can share space with the hash value.
+ */
+static void
+CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
+{
+ CHashPtr g;
+ CHashNode *n;
+ CHashPtr *garbage;
+
+ n = CHashTableGetNode(table, c);
+ garbage = CHashTableGetGarbageByBucket(table, bucket);
+
+ while (1)
+ {
+ g = *garbage;
+ n->un.gcnext = g;
+ if (__sync_bool_compare_and_swap(garbage, g, c))
+ break;
+ CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry);
+ }
+}
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
index cd85633..dbcc0f8 100644
--- a/src/include/storage/barrier.h
+++ b/src/include/storage/barrier.h
@@ -20,4 +20,12 @@
*/
#include "port/atomics.h"
+/*
+ * If dependency barriers are undefined, we define them as a no-op. The only
+ * known platform where anything more is required is DEC Alpha.
+ */
+#if !defined(pg_read_barrier_depends)
+#define pg_read_barrier_depends()
+#endif
+
#endif /* BARRIER_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 9b8ace5..b58af88 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -96,20 +96,6 @@ typedef struct buftag
)
/*
- * The shared buffer mapping table is partitioned to reduce contention.
- * To determine which partition lock a given tag requires, compute the tag's
- * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
- * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
- */
-#define BufTableHashPartition(hashcode) \
- ((hashcode) % NUM_BUFFER_PARTITIONS)
-#define BufMappingPartitionLock(hashcode) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
- BufTableHashPartition(hashcode)].lock)
-#define BufMappingPartitionLockByIndex(i) \
- (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
-
-/*
* BufferDesc -- shared descriptor/state data for a single shared buffer.
*
* Note: buf_hdr_lock must be held to examine or change the tag, flags,
@@ -196,14 +182,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
-/* buf_table.c */
-extern Size BufTableShmemSize(int size);
-extern void InitBufTable(int size);
-extern uint32 BufTableHashCode(BufferTag *tagPtr);
-extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
-extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
-
/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
BlockNumber blockNum);
@@ -215,4 +193,8 @@ extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);
+/* bufmgr.c */
+extern Size BufMgrShmemSize(int size);
+extern void BufMgrInitShmem(int size);
+
#endif /* BUFMGR_INTERNALS_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index e3c2efc..1b37447 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -144,7 +144,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
*/
/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS 128
+#define NUM_BUFFER_PARTITIONS 0
/* Number of partitions the shared lock tables are divided into */
#define LOG2_NUM_LOCK_PARTITIONS 4
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index d194f38..8d9b4dd 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -59,6 +59,14 @@ struct XidCache
#define FP_LOCK_SLOTS_PER_BACKEND 16
/*
+ * Some lock-free algorithms require each backend process to be able to
+ * advertise a certain number of "hazard pointers" in shared memory.
+ * Right now one per backend is enough for our purpose, but some
+ * algorithms require more.
+ */
+#define NUM_HAZARD_POINTERS 1
+
+/*
* Each backend has a PGPROC struct in shared memory. There is also a list of
* currently-unused PGPROC structs that will be reallocated to new backends.
*
@@ -143,6 +151,12 @@ struct PGPROC
bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
* lock */
+
+ /*
+ * Hazard pointers. Currently one is enough, though some algorithms
+ * require a few more.
+ */
+ void *hazard[NUM_HAZARD_POINTERS];
};
/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index c94d620..855b65e 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -40,6 +40,7 @@ extern void InitShmemIndex(void);
extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
HASHCTL *infoP, int hash_flags);
extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemAttachStruct(const char *name);
extern Size add_size(Size s1, Size s2);
extern Size mul_size(Size s1, Size s2);
diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h
new file mode 100644
index 0000000..ee0573c
--- /dev/null
+++ b/src/include/utils/chash.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * chash.h
+ * Concurrent shared-memory hash table.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/chash.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHASH_H
+#define CHASH_H
+
+/* Everything caller must supply to set up a concurrent hash table. */
+typedef struct
+{
+ const char *shmem_name; /* shared memory name for this hash table */
+ uint32 capacity; /* maximum size of hash table */
+ uint16 element_size; /* size of each element */
+ uint16 key_size; /* size of each key */
+} CHashDescriptor;
+
+/* Concurrent hash table statistics. */
+typedef enum
+{
+ CHS_Search, /* search */
+ CHS_Search_Failed, /* search failed (no such key) */
+ CHS_Insert, /* insert */
+ CHS_Insert_Failed, /* insert failed (duplicate key) */
+ CHS_Insert_Retry, /* insert retried (concurrent update) */
+ CHS_Delete, /* delete */
+ CHS_Delete_Failed, /* delete failed (no such key) */
+ CHS_Delete_Retry, /* delete retried (concurrent update) */
+ CHS_Scan_Expunge, /* scan expunged deleted item */
+ CHS_Scan_Expunge_Fail, /* scan failed to expunge */
+ CHS_Scan_Restart, /* concurrent deletes forced a scan restart */
+ CHS_Cleanup_Scan, /* concurrent update forced a cleanup scan */
+ CHS_Allocate_Fail, /* allocation failed to pop freelist */
+ CHS_Garbage_Enqueue_Retry, /* enqueue on garbage list retried */
+ CHS_Garbage_Dequeue_Fail, /* dequeue of garbage failed */
+ CHS_GC, /* garbage collection cycle */
+ CHS_GC_Spin, /* GC spun waiting for concurrent process */
+ CHS_GC_Reclaim_Skipped, /* GC recovered only one item */
+ CHS_GC_Reclaim_Fast, /* GC put garbage on freelist via fast path */
+ CHS_GC_Reclaim_Retry, /* enqueue of garbage on freelist retried */
+ CHS_NumberOfStatistics /* number of statistics */
+} CHashStatisticsType;
+
+/* Human-readable names for statistics. */
+extern char *CHashStatisticsNames[];
+
+/* Opaque handle for a concurrent hash table. */
+struct CHashTableData;
+typedef struct CHashTableData *CHashTable;
+
+/* Initialization functions. */
+extern CHashTable CHashBootstrap(CHashDescriptor *desc);
+extern Size CHashEstimateSize(CHashTable table);
+extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc);
+
+/* Accessor functions. */
+extern bool CHashInsert(CHashTable table, void *entry);
+extern bool CHashDelete(CHashTable table, void *key);
+extern bool CHashSearch(CHashTable table, void *entry);
+extern void CHashStatistics(CHashTable table, uint64 *stats);
+
+#endif /* CHASH_H */
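
For completeness, a rough sketch of how a client of this API might be set
up; the names are hypothetical and the exact bootstrap/size/initialize
sequence is inferred from the function names (the patch's BufMgrShmemSize
and BufMgrInitShmem presumably follow the same pattern, but aren't shown
here):

    typedef struct
    {
        BufferTag   key;        /* must be first: the key_size prefix */
        int         buf_id;
    } BufMapEnt;

    static CHashDescriptor buf_map_desc = {
        "buffer mapping table",     /* shmem_name */
        16384,                      /* capacity */
        sizeof(BufMapEnt),          /* element_size */
        sizeof(BufferTag)           /* key_size */
    };

    static CHashTable buf_map;

    Size
    BufMapShmemSize(void)
    {
        buf_map = CHashBootstrap(&buf_map_desc);
        return CHashEstimateSize(buf_map);
    }

    void
    BufMapShmemInit(void)
    {
        buf_map = CHashInitialize(buf_map, &buf_map_desc);
    }
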
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers