I've been working on and off on the WAL-insert scaling patch. It's in pretty good shape now, and I'll post it shortly, but one thing I noticed is that it benefits a lot from using an atomic compare-and-swap instruction for the contention-critical part.

I realized that we could also use compare-and-swap to make LWLocks scale better. The LWLock struct is too large to compare-and-swap atomically as a whole, but we can still use CAS to increment/decrement the shared/exclusive counters when there's no need to manipulate the wait queue. That would help with workloads where you have a lot of CPUs and a lot of backends need to acquire the same lwlock in shared mode, but there's no real contention (i.e. few exclusive lockers).
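
To illustrate, the shared-mode fast path would look roughly like this. This is a made-up standalone sketch, not the actual patch; the names are invented and it assumes the usual c.h typedefs (uint32, uint64, bool), the gcc __sync_val_compare_and_swap() builtin, and an atomic 64-bit read:

typedef union
{
	struct
	{
		uint32	exclusive;	/* # of exclusive holders (0 or 1) */
		uint32	shared;		/* # of shared holders */
	} f;
	uint64	val;
} lock_word;

/* Try to take 'lock' in shared mode; false means the caller must queue up. */
static bool
shared_acquire(volatile lock_word *lock)
{
	for (;;)
	{
		lock_word	oldval;
		lock_word	newval;

		oldval.val = lock->val;		/* assumed to be an atomic read */
		if (oldval.f.exclusive != 0)
			return false;			/* contended; fall back to slow path */
		newval = oldval;
		newval.f.shared++;
		if (__sync_val_compare_and_swap(&lock->val, oldval.val, newval.val)
			== oldval.val)
			return true;			/* counters updated, no spinlock touched */
		/* someone else modified the word concurrently; retry */
	}
}

As long as no-one needs to touch the wait queue, concurrent shared acquisitions only ever race on the CAS itself and simply retry, instead of queueing up behind a spinlock.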

pgbench -S is such a workload. With 9.3beta1, I'm seeing this profile when I run "pgbench -S -c64 -j64 -T60 -M prepared" on a 32-core Linux machine:

-  64.09%  postgres  postgres           [.] tas
   - tas
      - 99.83% s_lock
         - 53.22% LWLockAcquire
            + 99.87% GetSnapshotData
         - 46.78% LWLockRelease
              GetSnapshotData
            + GetTransactionSnapshot
+   2.97%  postgres  postgres           [.] tas
+   1.53%  postgres  libc-2.13.so       [.] 0x119873
+   1.44%  postgres  postgres           [.] GetSnapshotData
+   1.29%  postgres  [kernel.kallsyms]  [k] arch_local_irq_enable
+   1.18%  postgres  postgres           [.] AllocSetAlloc
...

So, on this test, a lot of time is wasted spinning on the mutex of ProcArrayLock. If you plot a graph of TPS vs. # of clients, there is a surprisingly steep drop in performance once you go beyond 29 clients (attached, pgbench-lwlock-cas-local-clients-sets.png, red line). My theory is that beyond that point all the cores are busy, and processes start to get context-switched while holding the spinlock, which kills performance. Has anyone else seen that pattern?

Curiously, I don't see that drop when connecting pgbench via TCP over localhost, only when connecting via unix domain sockets. Overall performance is higher over unix domain sockets, so I guess the TCP layer adds some overhead that hurts performance, and also affects scheduling somehow, making the steep drop go away.

Using a compare-and-swap instruction in place of spinning solves the problem (green line in attached graph). This needs some more testing with different workloads to make sure it doesn't hurt other scenarios, but I don't think it will. I'm also waiting for a colleague to test this on a different machine, where he saw a similar steep drop in performance.

The attached patch is still work-in-progress. There needs to be a configure test and a fallback to spinlocks if a CAS instruction is not available. I used the gcc __sync_val_compare_and_swap() builtin directly; that needs to be abstracted. Also, in the case that the wait queue needs to be manipulated, the code spins on the CAS instruction, but there is no delay mechanism like there is on a regular spinlock; that needs to be added in somehow.
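
Roughly what I have in mind for the abstraction (all of these names are hypothetical, just to show the shape of it): configure would test for the builtin and define something like HAVE_GCC_SYNC_CAS64, and a header would map a pg_cas_u64() either to the builtin or to a spinlock-protected emulation:

#ifdef HAVE_GCC_SYNC_CAS64

#define pg_cas_u64(ptr, oldval, newval) \
	__sync_val_compare_and_swap((ptr), (oldval), (newval))

#else							/* no CAS instruction, emulate with a spinlock */

extern slock_t cas_emulation_lock;	/* initialized at postmaster startup */

static inline uint64
pg_cas_u64(volatile uint64 *ptr, uint64 oldval, uint64 newval)
{
	uint64		current;

	SpinLockAcquire(&cas_emulation_lock);
	current = *ptr;
	if (current == oldval)
		*ptr = newval;
	SpinLockRelease(&cas_emulation_lock);
	return current;
}

#endif

A single global emulation lock is of course a gross simplification; a real fallback would probably want to hash the address to one of several spinlocks. The retry loops would similarly need s_lock()-style backoff when the CAS keeps failing, instead of retrying at full speed.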

- Heikki

<<attachment: pgbench-lwlock-cas-local-clients-sets.png>>

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 4f88d3f..0d92a30 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -36,13 +36,27 @@
 /* We use the ShmemLock spinlock to protect LWLockAssign */
 extern slock_t *ShmemLock;
 
+/*
+ * The part of LWLock struct that needs to be swappable with an atomic
+ * compare-and-swap instruction. Assuming we have an atomic 64-bit CAS
+ * instruction, sizeof(LWLock_cas_t) must be <= 8.
+ */
+typedef union
+{
+	struct
+	{
+		bool	mutex;			/* Protects LWLock and queue of PGPROCs */
+		bool	releaseOK;		/* T if ok to release waiters */
+		bool	waiters;		/* T if wait queue is not empty (head != NULL) */
+		char	exclusive;		/* # of exclusive holders (0 or 1) */
+		uint32	shared;			/* # of shared holders (0..MaxBackends) */
+	} f;
+	uint64 val;
+} LWLock_cas_t;
 
 typedef struct LWLock
 {
-	slock_t		mutex;			/* Protects LWLock and queue of PGPROCs */
-	bool		releaseOK;		/* T if ok to release waiters */
-	char		exclusive;		/* # of exclusive holders (0 or 1) */
-	int			shared;			/* # of shared holders (0..MaxBackends) */
+	LWLock_cas_t		casval;
 	PGPROC	   *head;			/* head of list of waiting PGPROCs */
 	PGPROC	   *tail;			/* tail of list of waiting PGPROCs */
 	/* tail is undefined when head is NULL */
@@ -267,6 +281,9 @@ CreateLWLocks(void)
 	char	   *ptr;
 	int			id;
 
+	StaticAssertStmt(sizeof(LWLock_cas_t) == sizeof(uint64),
+					 "unexpected size of LWLock_cas_t");
+
 	/* Allocate space */
 	ptr = (char *) ShmemAlloc(spaceLocks);
 
@@ -283,10 +300,10 @@ CreateLWLocks(void)
 	 */
 	for (id = 0, lock = LWLockArray; id < numLocks; id++, lock++)
 	{
-		SpinLockInit(&lock->lock.mutex);
-		lock->lock.releaseOK = true;
-		lock->lock.exclusive = 0;
-		lock->lock.shared = 0;
+		lock->lock.casval.val = 0; /* clear possible padding */
+		lock->lock.casval.f.releaseOK = true;
+		lock->lock.casval.f.exclusive = 0;
+		lock->lock.casval.f.shared = 0;
 		lock->lock.head = NULL;
 		lock->lock.tail = NULL;
 	}
@@ -344,6 +361,7 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
 	PGPROC	   *proc = MyProc;
 	bool		retry = false;
 	int			extraWaits = 0;
+	LWLock_cas_t oldval;
 
 	PRINT_LWDEBUG("LWLockAcquire", lockid, lock);
 
@@ -392,27 +410,30 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
 	 * cycle because the lock is not free when a released waiter finally gets
 	 * to run.	See pgsql-hackers archives for 29-Dec-01.
 	 */
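+	/*
+	 * Begin with an optimistic guess: the lock is free and uncontended.  If
+	 * the guess is wrong, the CAS below fails and returns the word's actual
+	 * value, which becomes the starting point for the next attempt.
+	 */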
+	oldval.val = 0;
+	oldval.f.mutex = false;
+	oldval.f.releaseOK = true;
+	oldval.f.exclusive = 0;
+	oldval.f.shared = 0;
 	for (;;)
 	{
 		bool		mustwait;
+		LWLock_cas_t newval;
+		LWLock_cas_t x;
 
 		/* Acquire mutex.  Time spent holding mutex should be short! */
-#ifdef LWLOCK_STATS
-		spin_delay_counts[lockid] += SpinLockAcquire(&lock->mutex);
-#else
-		SpinLockAcquire(&lock->mutex);
-#endif
+		newval = oldval;
 
 		/* If retrying, allow LWLockRelease to release waiters again */
 		if (retry)
-			lock->releaseOK = true;
+			newval.f.releaseOK = true;
 
 		/* If I can get the lock, do so quickly. */
 		if (mode == LW_EXCLUSIVE)
 		{
-			if (lock->exclusive == 0 && lock->shared == 0)
+			if (oldval.f.exclusive == 0 && oldval.f.shared == 0)
 			{
-				lock->exclusive++;
+				newval.f.exclusive++;
 				mustwait = false;
 			}
 			else
@@ -420,14 +441,29 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
 		}
 		else
 		{
-			if (lock->exclusive == 0)
+			if (oldval.f.exclusive == 0)
 			{
-				lock->shared++;
+				newval.f.shared++;
 				mustwait = false;
 			}
 			else
 				mustwait = true;
 		}
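+		/*
+		 * A single CAS does double duty here: it updates the counters, and
+		 * if we must wait, it also sets the mutex bit, handing us the wait
+		 * queue mutex without a separate spinlock acquisition.
+		 */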
+		newval.f.mutex = mustwait;
+		oldval.f.mutex = false;
+		x.val = __sync_val_compare_and_swap(&lock->casval.val, oldval.val, newval.val);
+		if (x.val != oldval.val)
+		{
+			/* failed to acquire lock or mutex, retry */
+			oldval = x;
+			continue;
+		}
+
+#ifdef LWLOCK_STATS
+		/*
+		 * XXX: the mutex spinlock is gone, so there are no spin delays to
+		 * count here anymore; CAS retries are not instrumented yet.
+		 */
+#endif
 
 		if (!mustwait)
 			break;				/* got the lock */
@@ -452,7 +488,11 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
 		lock->tail = proc;
 
 		/* Can release the mutex now */
-		SpinLockRelease(&lock->mutex);
+		oldval = newval;
+		newval.f.waiters = true;
+		newval.f.mutex = false;
+		x.val = __sync_val_compare_and_swap(&lock->casval.val, oldval.val, newval.val);
+		Assert(x.val == oldval.val);
 
 		/*
 		 * Wait until awakened.
@@ -492,7 +532,6 @@ LWLockAcquire(LWLockId lockid, LWLockMode mode)
 	}
 
 	/* We are done updating shared state of the lock itself. */
-	SpinLockRelease(&lock->mutex);
 
 	TRACE_POSTGRESQL_LWLOCK_ACQUIRE(lockid, mode);
 
@@ -518,6 +557,9 @@ LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
 {
 	volatile LWLock *lock = &(LWLockArray[lockid].lock);
 	bool		mustwait;
+	LWLock_cas_t oldval;
+	LWLock_cas_t newval;
+	LWLock_cas_t x;
 
 	PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);
 
@@ -533,14 +575,20 @@ LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
 	HOLD_INTERRUPTS();
 
 	/* Acquire mutex.  Time spent holding mutex should be short! */
-	SpinLockAcquire(&lock->mutex);
-
+	oldval.val = 0;
+	oldval.f.mutex = false;
+	oldval.f.releaseOK = true;
+	oldval.f.exclusive = 0;
+	oldval.f.shared = 0;
+
+retry:
+	newval = oldval;
 	/* If I can get the lock, do so quickly. */
 	if (mode == LW_EXCLUSIVE)
 	{
-		if (lock->exclusive == 0 && lock->shared == 0)
+		if (oldval.f.exclusive == 0 && oldval.f.shared == 0)
 		{
-			lock->exclusive++;
+			newval.f.exclusive++;
 			mustwait = false;
 		}
 		else
@@ -548,17 +596,29 @@ LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
 	}
 	else
 	{
-		if (lock->exclusive == 0)
+		if (oldval.f.exclusive == 0)
 		{
-			lock->shared++;
+			newval.f.shared++;
 			mustwait = false;
 		}
 		else
 			mustwait = true;
 	}
 
+	if (!mustwait)
+	{
+		newval.f.mutex = false;
+		oldval.f.mutex = false;
+		x.val = __sync_val_compare_and_swap(&lock->casval.val, oldval.val, newval.val);
+		if (x.val != oldval.val)
+		{
+			/* XXX: would it be better to give up on a CAS race, rather than retry? */
+			oldval = x;
+			goto retry;
+		}
+	}
+
 	/* We are done updating shared state of the lock itself. */
-	SpinLockRelease(&lock->mutex);
 
 	if (mustwait)
 	{
@@ -598,6 +658,9 @@ LWLockAcquireOrWait(LWLockId lockid, LWLockMode mode)
 	PGPROC	   *proc = MyProc;
 	bool		mustwait;
 	int			extraWaits = 0;
+	LWLock_cas_t oldval;
+	LWLock_cas_t newval;
+	LWLock_cas_t x;
 
 	PRINT_LWDEBUG("LWLockAcquireOrWait", lockid, lock);
 
@@ -619,14 +682,21 @@ LWLockAcquireOrWait(LWLockId lockid, LWLockMode mode)
 	HOLD_INTERRUPTS();
 
 	/* Acquire mutex.  Time spent holding mutex should be short! */
-	SpinLockAcquire(&lock->mutex);
+	oldval.val = 0;
+	oldval.f.mutex = false;
+	oldval.f.releaseOK = true;
+	oldval.f.exclusive = 0;
+	oldval.f.shared = 0;
+
+retry:
+	newval = oldval;
 
 	/* If I can get the lock, do so quickly. */
 	if (mode == LW_EXCLUSIVE)
 	{
-		if (lock->exclusive == 0 && lock->shared == 0)
+		if (oldval.f.exclusive == 0 && oldval.f.shared == 0)
 		{
-			lock->exclusive++;
+			newval.f.exclusive++;
 			mustwait = false;
 		}
 		else
@@ -634,15 +704,24 @@ LWLockAcquireOrWait(LWLockId lockid, LWLockMode mode)
 	}
 	else
 	{
-		if (lock->exclusive == 0)
+		if (oldval.f.exclusive == 0)
 		{
-			lock->shared++;
+			newval.f.shared++;
 			mustwait = false;
 		}
 		else
 			mustwait = true;
 	}
 
+	newval.f.mutex = mustwait;
+	oldval.f.mutex = false;
+	x.val = __sync_val_compare_and_swap(&lock->casval.val, oldval.val, newval.val);
+	if (x.val != oldval.val)
+	{
+		oldval = x;
+		goto retry;
+	}
+
 	if (mustwait)
 	{
 		/*
@@ -665,7 +744,11 @@ LWLockAcquireOrWait(LWLockId lockid, LWLockMode mode)
 		lock->tail = proc;
 
 		/* Can release the mutex now */
-		SpinLockRelease(&lock->mutex);
+		oldval = newval;
+		newval.f.mutex = false;
+		newval.f.waiters = true;
+		x.val = __sync_val_compare_and_swap(&lock->casval.val, oldval.val, newval.val);
+		Assert(x.val == oldval.val);
 
 		/*
 		 * Wait until awakened.  Like in LWLockAcquire, be prepared for bogus
@@ -694,8 +777,7 @@ LWLockAcquireOrWait(LWLockId lockid, LWLockMode mode)
 	}
 	else
 	{
-		/* We are done updating shared state of the lock itself. */
-		SpinLockRelease(&lock->mutex);
+		/* the CAS above already acquired the lock; nothing more to do */
 	}
 
 	/*
@@ -731,6 +813,9 @@ LWLockRelease(LWLockId lockid)
 	PGPROC	   *head;
 	PGPROC	   *proc;
 	int			i;
+	LWLock_cas_t oldval;
+	LWLock_cas_t newval;
+	LWLock_cas_t x;
 
 	PRINT_LWDEBUG("LWLockRelease", lockid, lock);
 
@@ -749,16 +834,38 @@ LWLockRelease(LWLockId lockid)
 	for (; i < num_held_lwlocks; i++)
 		held_lwlocks[i] = held_lwlocks[i + 1];
 
-	/* Acquire mutex.  Time spent holding mutex should be short! */
-	SpinLockAcquire(&lock->mutex);
+	oldval.val = 0;
+	oldval.f.mutex = false;
+	oldval.f.releaseOK = true;
+	/* guess that we're holding the lock in exclusive mode, and no waiters */
+	oldval.f.waiters = false;
+	oldval.f.exclusive = 1;
+	oldval.f.shared = 0;
+
+retry:
+	newval = oldval;
 
 	/* Release my hold on lock */
-	if (lock->exclusive > 0)
-		lock->exclusive--;
+	if (oldval.f.exclusive > 0)
+		newval.f.exclusive--;
 	else
 	{
-		Assert(lock->shared > 0);
-		lock->shared--;
+		Assert(oldval.f.shared > 0);
+		newval.f.shared--;
+	}
+
+	/*
+	 * If there are waiters, need to acquire mutex to manipulate the waiter
+	 * list.
+	 */
+	newval.f.mutex = oldval.f.waiters;
+	oldval.f.mutex = false;
+	x.val = __sync_val_compare_and_swap(&lock->casval.val, oldval.val, newval.val);
+	if (x.val != oldval.val)
+	{
+		/* someone else was holding mutex, retry */
+		oldval = x;
+		goto retry;
 	}
 
 	/*
@@ -767,10 +874,17 @@ LWLockRelease(LWLockId lockid)
 	 * if someone has already awakened waiters that haven't yet acquired the
 	 * lock.
 	 */
-	head = lock->head;
+	if (oldval.f.waiters)
+	{
+		head = lock->head;
+		Assert(head != NULL);
+	}
+	else
+		head = NULL;
 	if (head != NULL)
 	{
-		if (lock->exclusive == 0 && lock->shared == 0 && lock->releaseOK)
+		oldval = newval;
+		if (newval.f.exclusive == 0 && newval.f.shared == 0 && newval.f.releaseOK)
 		{
 			/*
 			 * Remove the to-be-awakened PGPROCs from the queue.
@@ -812,17 +926,23 @@ LWLockRelease(LWLockId lockid)
 			if (proc->lwWaitMode != LW_WAIT_UNTIL_FREE)
 				releaseOK = false;
 
-			lock->releaseOK = releaseOK;
+			newval.f.releaseOK = releaseOK;
 		}
 		else
 		{
 			/* lock is still held, can't awaken anything */
 			head = NULL;
 		}
+
+		/* release mutex */
+		newval.f.mutex = false;
+		if (lock->head == NULL)
+			newval.f.waiters = false;
+		x.val = __sync_val_compare_and_swap(&lock->casval.val, oldval.val, newval.val);
+		Assert(x.val == oldval.val);
 	}
 
 	/* We are done updating shared state of the lock itself. */
-	SpinLockRelease(&lock->mutex);
 
 	TRACE_POSTGRESQL_LWLOCK_RELEASE(lockid);
 