On 2017-06-05 16:22, Sokolov Yura wrote:
Good day, everyone.

This patch improves the performance of contended LWLocks.
The patch performs lock acquisition in a single CAS loop:
1. LWLock->state is read, and we check whether the lock can be acquired.
   If taking the lock looks possible, a CAS is attempted.
   If the CAS succeeds, the lock is acquired (same as the original version).
2. But if the lock is currently held by another backend, we check whether
   the wait list lock can be taken. If the wait list lock is not held by
   anyone, a CAS is performed that takes the wait list lock and sets
   LW_FLAG_HAS_WAITERS at once. If this CAS succeeds, then the LWLock was
   still held at the moment the wait list lock was taken - this proves the
   correctness of the new algorithm. The proc is then queued to the wait
   list (see the sketch after this list).
3. Otherwise a spin delay is performed, and the loop returns to step 1.
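
For illustration, here is a minimal sketch of that loop in C, condensed
from LWLockAttemptLockOrQueueSelf in patch 2 below (stats, LOCK_DEBUG,
assertions and finish_spin_delay handling are omitted; the function name
is mine, not the patch's):

static bool
lwlock_acquire_or_queue_sketch(LWLock *lock, LWLockMode mode)
{
	uint32		mask = (mode == LW_EXCLUSIVE) ? LW_LOCK_MASK : LW_VAL_EXCLUSIVE;
	uint32		add = (mode == LW_EXCLUSIVE) ? LW_VAL_EXCLUSIVE : LW_VAL_SHARED;
	uint32		old_state = pg_atomic_read_u32(&lock->state);
	SpinDelayStatus delay;

	init_local_spin_delay(&delay);
	for (;;)
	{
		if ((old_state & mask) == 0)
		{
			/*
			 * Step 1: lock looks free, try to take it.  On failure the
			 * CAS refreshes old_state, and we simply loop again.
			 */
			if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
											   old_state + add))
				return false;	/* lock acquired */
		}
		else if ((old_state & LW_FLAG_LOCKED) == 0)
		{
			/*
			 * Step 2: lock is busy; take the wait list lock and set
			 * LW_FLAG_HAS_WAITERS in one CAS.  Success proves the
			 * conflicting lock was still held at that instant.
			 */
			if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
											   old_state | LW_FLAG_LOCKED | LW_FLAG_HAS_WAITERS))
			{
				LWLockQueueSelfLocked(lock, mode);
				return true;	/* queued; caller must sleep */
			}
		}
		else
		{
			/* Step 3: wait list lock is busy too; spin and retry. */
			perform_spin_delay(&delay);
			old_state = pg_atomic_read_u32(&lock->state);
		}
	}
}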


I'm sending a rebased version with a couple of one-line tweaks.
(a smaller skip_wait_list on shared locks, and no spin-stat update on acquiring)

With regards,
--
Sokolov Yura aka funny_falcon
Postgres Professional: https://postgrespro.ru
The Russian Postgres Company
From dc8d9489074973ba589adf9b0239e1467cbba44b Mon Sep 17 00:00:00 2001
From: Sokolov Yura <funny.fal...@postgrespro.ru>
Date: Mon, 29 May 2017 09:25:41 +0000
Subject: [PATCH 1/6] More correct use of spinlock inside LWLockWaitListLock.

SpinDelayStatus should be function-global so that iterations are counted
correctly and the delays produced are more accurate.

Also, if the spin loop didn't actually spin, don't count it in the
spins_per_delay correction. This wasn't necessary before, because
delayStatus was used only in contended cases.
---
 src/backend/storage/lmgr/lwlock.c | 36 ++++++++++++++++--------------------
 src/backend/storage/lmgr/s_lock.c |  3 +++
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 82a1cf5150..86966b804e 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -323,6 +323,15 @@ get_lwlock_stats_entry(LWLock *lock)
 	}
 	return lwstats;
 }
+
+static void
+add_lwlock_stats_spin_stat(LWLock *lock, SpinDelayStatus *delayStatus)
+{
+	lwlock_stats *lwstats;
+
+	lwstats = get_lwlock_stats_entry(lock);
+	lwstats->spin_delay_count += delayStatus->delays;
+}
 #endif							/* LWLOCK_STATS */
 
 
@@ -800,13 +809,9 @@ static void
 LWLockWaitListLock(LWLock *lock)
 {
 	uint32		old_state;
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-	uint32		delays = 0;
-
-	lwstats = get_lwlock_stats_entry(lock);
-#endif
+	SpinDelayStatus delayStatus;
 
+	init_local_spin_delay(&delayStatus);
 	while (true)
 	{
 		/* always try once to acquire lock directly */
@@ -815,20 +820,10 @@ LWLockWaitListLock(LWLock *lock)
 			break;				/* got lock */
 
 		/* and then spin without atomic operations until lock is released */
+		while (old_state & LW_FLAG_LOCKED)
 		{
-			SpinDelayStatus delayStatus;
-
-			init_local_spin_delay(&delayStatus);
-
-			while (old_state & LW_FLAG_LOCKED)
-			{
-				perform_spin_delay(&delayStatus);
-				old_state = pg_atomic_read_u32(&lock->state);
-			}
-#ifdef LWLOCK_STATS
-			delays += delayStatus.delays;
-#endif
-			finish_spin_delay(&delayStatus);
+			perform_spin_delay(&delayStatus);
+			old_state = pg_atomic_read_u32(&lock->state);
 		}
 
 		/*
@@ -836,9 +831,10 @@ LWLockWaitListLock(LWLock *lock)
 		 * we're attempting to get it again.
 		 */
 	}
+	finish_spin_delay(&delayStatus);
 
 #ifdef LWLOCK_STATS
-	lwstats->spin_delay_count += delays;
+	add_lwlock_stats_spin_stat(lock, &delayStatus);
 #endif
 }
 
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index 40d8156331..f3e0fbc602 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -177,6 +177,9 @@ finish_spin_delay(SpinDelayStatus *status)
 	if (status->cur_delay == 0)
 	{
 		/* we never had to delay */
+		if (status->spins == 0)
+			/* but we didn't spin either, so ignore */
+			return;
 		if (spins_per_delay < MAX_SPINS_PER_DELAY)
 			spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
 	}
-- 
2.11.0
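
To make patch 1's intended usage pattern concrete, here is a minimal
sketch of a function-global SpinDelayStatus (not part of the patch;
try_lock_wait_list is a hypothetical helper standing in for the CAS
attempt):

static void
wait_list_lock_sketch(LWLock *lock)
{
	SpinDelayStatus delayStatus;

	/* initialized once, so delays accumulate across all retries */
	init_local_spin_delay(&delayStatus);
	while (!try_lock_wait_list(lock))	/* hypothetical CAS attempt */
		perform_spin_delay(&delayStatus);

	/*
	 * With the patch, this no longer adjusts spins_per_delay when we
	 * never spun (spins == 0), so uncontended callers don't distort
	 * the statistics.
	 */
	finish_spin_delay(&delayStatus);
}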


From 067282cce98c77b9f1731cf7d302c37fcc70d59e Mon Sep 17 00:00:00 2001
From: Sokolov Yura <funny.fal...@postgrespro.ru>
Date: Tue, 30 May 2017 14:05:26 +0300
Subject: [PATCH 2/6] Perform atomic LWLock acquisition or queueing into the
 wait list.

Before this change, an lwlock was acquired in several steps:
- first, try to acquire the lock lock-free (one CAS spin loop)
- if that was not successful, queue ourselves into the wait list
  (two CAS spin loops; in fact three, because of atomic_fetch_and_or)
- try again to acquire the lock (one more CAS loop)
- if the second attempt was successful, dequeue ourselves from the wait
  queue (two CAS spin loops).

With this change, the lock, or the wait list lock, is acquired using a
single CAS loop:
- if acquiring the desired lock mode looks likely, we CAS for that
- otherwise we CAS to lock the wait list and set the necessary flag
  (LW_FLAG_HAS_WAITERS) at once.
  (LWLockWaitListUnlock still performs a second CAS loop to release the
   wait list lock.)

Correctness is provided by the fact that we try to lock the wait list
only while someone holds a conflicting lock. If the conflicting lock has
been released, the CAS for the wait list will not succeed, and we will
retry acquiring the lock.
---
 src/backend/storage/lmgr/lwlock.c | 289 ++++++++++++++++++++++----------------
 1 file changed, 171 insertions(+), 118 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 86966b804e..c73f7430c9 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -727,20 +727,33 @@ GetLWLockIdentifier(uint32 classId, uint16 eventId)
 
 /*
  * Internal function that tries to atomically acquire the lwlock in the passed
- * in mode.
+ * in mode. If it cannot grab the lock, it does not put the proc into the
+ * wait queue.
  *
- * This function will not block waiting for a lock to become free - that's the
- * callers job.
+ * It is used only in LWLockConditionalAcquire.
  *
- * Returns true if the lock isn't free and we need to wait.
+ * Returns true if the lock isn't free.
  */
 static bool
-LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+LWLockAttemptLockOnce(LWLock *lock, LWLockMode mode)
 {
 	uint32		old_state;
+	uint32		mask,
+				add;
 
 	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
 
+	if (mode == LW_EXCLUSIVE)
+	{
+		mask = LW_LOCK_MASK;
+		add = LW_VAL_EXCLUSIVE;
+	}
+	else
+	{
+		mask = LW_VAL_EXCLUSIVE;
+		add = LW_VAL_SHARED;
+	}
+
 	/*
 	 * Read once outside the loop, later iterations will get the newer value
 	 * via compare & exchange.
@@ -755,46 +768,126 @@ LWLockAttemptLock(LWLock *lock, LWLockMode mode)
 
 		desired_state = old_state;
 
-		if (mode == LW_EXCLUSIVE)
+		lock_free = (old_state & mask) == 0;
+		if (lock_free)
 		{
-			lock_free = (old_state & LW_LOCK_MASK) == 0;
-			if (lock_free)
-				desired_state += LW_VAL_EXCLUSIVE;
+			desired_state += add;
+			if (pg_atomic_compare_exchange_u32(&lock->state,
+											   &old_state, desired_state))
+			{
+				/* Great! Got the lock. */
+#ifdef LOCK_DEBUG
+				if (mode == LW_EXCLUSIVE)
+					lock->owner = MyProc;
+#endif
+				return false;
+			}
 		}
 		else
 		{
-			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
-			if (lock_free)
-				desired_state += LW_VAL_SHARED;
+			/*
+			 * This barrier is not needed for correctness, and it is a no-op
+			 * on x86. But the first iteration of the CAS loop in
+			 * ProcArrayGroupClearXid will probably succeed more often with it.
+			 */
+			pg_read_barrier();
+			return true;
 		}
+	}
+	pg_unreachable();
+}
 
-		/*
-		 * Attempt to swap in the state we are expecting. If we didn't see
-		 * lock to be free, that's just the old value. If we saw it as free,
-		 * we'll attempt to mark it acquired. The reason that we always swap
-		 * in the value is that this doubles as a memory barrier. We could try
-		 * to be smarter and only swap in values if we saw the lock as free,
-		 * but benchmark haven't shown it as beneficial so far.
-		 *
-		 * Retry if the value changed since we last looked at it.
-		 */
-		if (pg_atomic_compare_exchange_u32(&lock->state,
-										   &old_state, desired_state))
+static void LWLockQueueSelfLocked(LWLock *lock, LWLockMode mode);
+
+/*
+ * Internal function that tries to atomically acquire the lwlock in the passed
+ * in mode, or puts itself into the wait queue with waitmode.
+ *
+ * This function will not block waiting for a lock to become free - that's the
+ * callers job.
+ *
+ * Returns true if the lock isn't free and we are in a wait queue.
+ */
+static inline bool
+LWLockAttemptLockOrQueueSelf(LWLock *lock, LWLockMode mode, LWLockMode waitmode)
+{
+	uint32		old_state;
+	SpinDelayStatus delayStatus;
+	bool		lock_free;
+	uint32		mask,
+				add;
+
+	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+	if (mode == LW_EXCLUSIVE)
+	{
+		mask = LW_LOCK_MASK;
+		add = LW_VAL_EXCLUSIVE;
+	}
+	else
+	{
+		mask = LW_VAL_EXCLUSIVE;
+		add = LW_VAL_SHARED;
+	}
+
+	init_local_spin_delay(&delayStatus);
+
+	/*
+	 * Read once outside the loop. Later iterations will get the newer value
+	 * either via compare & exchange or via a read after perform_spin_delay.
+	 */
+	old_state = pg_atomic_read_u32(&lock->state);
+	/* loop until we've determined whether we could acquire the lock or not */
+	while (true)
+	{
+		uint32		desired_state;
+
+		desired_state = old_state;
+
+		lock_free = (old_state & mask) == 0;
+		if (lock_free)
 		{
-			if (lock_free)
+			desired_state += add;
+			if (pg_atomic_compare_exchange_u32(&lock->state,
+											   &old_state, desired_state))
 			{
 				/* Great! Got the lock. */
 #ifdef LOCK_DEBUG
 				if (mode == LW_EXCLUSIVE)
 					lock->owner = MyProc;
 #endif
-				return false;
+				break;
+			}
+		}
+		else if ((old_state & LW_FLAG_LOCKED) == 0)
+		{
+			desired_state |= LW_FLAG_LOCKED | LW_FLAG_HAS_WAITERS;
+			if (pg_atomic_compare_exchange_u32(&lock->state,
+											   &old_state, desired_state))
+			{
+				LWLockQueueSelfLocked(lock, waitmode);
+				break;
 			}
-			else
-				return true;	/* somebody else has the lock */
+		}
+		else
+		{
+			perform_spin_delay(&delayStatus);
+			old_state = pg_atomic_read_u32(&lock->state);
 		}
 	}
-	pg_unreachable();
+#ifdef LWLOCK_STATS
+	add_lwlock_stats_spin_stat(lock, &delayStatus);
+#endif
+
+	/*
+	 * We intentionally do not call finish_spin_delay here, because under
+	 * contention the loop above usually finishes by queueing into the wait
+	 * list, and doesn't reach spins_per_delay (and so doesn't sleep inside
+	 * perform_spin_delay). Also, different LWLocks have very different
+	 * contention patterns, and it is wrong to update the spinlock statistics
+	 * based on LWLock contention.
+	 */
+	return !lock_free;
 }
 
 /*
@@ -960,7 +1053,7 @@ LWLockWakeup(LWLock *lock)
 }
 
 /*
- * Add ourselves to the end of the queue.
+ * Add ourselves to the end of the queue, acquiring the wait list lock.
  *
  * NB: Mode can be LW_WAIT_UNTIL_FREE here!
  */
@@ -983,6 +1076,19 @@ LWLockQueueSelf(LWLock *lock, LWLockMode mode)
 	/* setting the flag is protected by the spinlock */
 	pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
 
+	LWLockQueueSelfLocked(lock, mode);
+}
+
+/*
+ * Add ourselves to the end of the queue.
+ * The wait list lock must be held at this point, and all necessary flags
+ * must already be set. This function releases the wait list lock.
+ *
+ * NB: Mode can be LW_WAIT_UNTIL_FREE here!
+ */
+static void
+LWLockQueueSelfLocked(LWLock *lock, LWLockMode mode)
+{
 	MyProc->lwWaiting = true;
 	MyProc->lwWaitMode = mode;
 
@@ -1165,11 +1271,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 	{
 		bool		mustwait;
 
-		/*
-		 * Try to grab the lock the first time, we're not in the waitqueue
-		 * yet/anymore.
-		 */
-		mustwait = LWLockAttemptLock(lock, mode);
+		mustwait = LWLockAttemptLockOrQueueSelf(lock, mode, mode);
 
 		if (!mustwait)
 		{
@@ -1178,42 +1280,17 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		}
 
 		/*
-		 * Ok, at this point we couldn't grab the lock on the first try. We
-		 * cannot simply queue ourselves to the end of the list and wait to be
-		 * woken up because by now the lock could long have been released.
-		 * Instead add us to the queue and try to grab the lock again. If we
-		 * succeed we need to revert the queuing and be happy, otherwise we
-		 * recheck the lock. If we still couldn't grab it, we know that the
-		 * other locker will see our queue entries when releasing since they
-		 * existed before we checked for the lock.
-		 */
-
-		/* add to the queue */
-		LWLockQueueSelf(lock, mode);
-
-		/* we're now guaranteed to be woken up if necessary */
-		mustwait = LWLockAttemptLock(lock, mode);
-
-		/* ok, grabbed the lock the second time round, need to undo queueing */
-		if (!mustwait)
-		{
-			LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue");
-
-			LWLockDequeueSelf(lock);
-			break;
-		}
-
-		/*
 		 * Wait until awakened.
 		 *
-		 * Since we share the process wait semaphore with the regular lock
-		 * manager and ProcWaitForSignal, and we may need to acquire an LWLock
-		 * while one of those is pending, it is possible that we get awakened
-		 * for a reason other than being signaled by LWLockRelease. If so,
-		 * loop back and wait again.  Once we've gotten the LWLock,
-		 * re-increment the sema by the number of additional signals received,
-		 * so that the lock manager or signal manager will see the received
-		 * signal when it next waits.
+		 * At this point we are already in a wait queue. Since we share the
+		 * process wait semaphore with the regular lock manager and
+		 * ProcWaitForSignal, and we may need to acquire an LWLock while one
+		 * of those is pending, it is possible that we get awakened for a
+		 * reason other than being signaled by LWLockRelease. If so, loop back
+		 * and wait again.  Once we've gotten the LWLock, re-increment the
+		 * sema by the number of additional signals received, so that the lock
+		 * manager or signal manager will see the received signal when it next
+		 * waits.
 		 */
 		LOG_LWDEBUG("LWLockAcquire", lock, "waiting");
 
@@ -1296,7 +1373,7 @@ LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
 	HOLD_INTERRUPTS();
 
 	/* Check for the lock */
-	mustwait = LWLockAttemptLock(lock, mode);
+	mustwait = LWLockAttemptLockOnce(lock, mode);
 
 	if (mustwait)
 	{
@@ -1357,67 +1434,43 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
 	 */
 	HOLD_INTERRUPTS();
 
-	/*
-	 * NB: We're using nearly the same twice-in-a-row lock acquisition
-	 * protocol as LWLockAcquire(). Check its comments for details.
-	 */
-	mustwait = LWLockAttemptLock(lock, mode);
+	mustwait = LWLockAttemptLockOrQueueSelf(lock, mode, LW_WAIT_UNTIL_FREE);
 
 	if (mustwait)
 	{
-		LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
-
-		mustwait = LWLockAttemptLock(lock, mode);
-
-		if (mustwait)
-		{
-			/*
-			 * Wait until awakened.  Like in LWLockAcquire, be prepared for
-			 * bogus wakeups, because we share the semaphore with
-			 * ProcWaitForSignal.
-			 */
-			LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting");
+		/*
+		 * Wait until awakened.  Like in LWLockAcquire, be prepared for bogus
+		 * wakeups, because we share the semaphore with ProcWaitForSignal.
+		 */
+		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting");
 
 #ifdef LWLOCK_STATS
-			lwstats->block_count++;
+		lwstats->block_count++;
 #endif
 
-			LWLockReportWaitStart(lock);
-			TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+		LWLockReportWaitStart(lock);
+		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
 
-			for (;;)
-			{
-				PGSemaphoreLock(proc->sem);
-				if (!proc->lwWaiting)
-					break;
-				extraWaits++;
-			}
+		for (;;)
+		{
+			PGSemaphoreLock(proc->sem);
+			if (!proc->lwWaiting)
+				break;
+			extraWaits++;
+		}
 
 #ifdef LOCK_DEBUG
-			{
-				/* not waiting anymore */
-				uint32		nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
-
-				Assert(nwaiters < MAX_BACKENDS);
-			}
-#endif
-			TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
-			LWLockReportWaitEnd();
-
-			LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
-		}
-		else
 		{
-			LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue");
+			/* not waiting anymore */
+			uint32		nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
 
-			/*
-			 * Got lock in the second attempt, undo queueing. We need to treat
-			 * this as having successfully acquired the lock, otherwise we'd
-			 * not necessarily wake up people we've prevented from acquiring
-			 * the lock.
-			 */
-			LWLockDequeueSelf(lock);
+			Assert(nwaiters < MAX_BACKENDS);
 		}
+#endif
+		TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+		LWLockReportWaitEnd();
+
+		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
 	}
 
 	/*
-- 
2.11.0


From 4bf5f8a3739cc32b5a996188e8766e7a472f2647 Mon Sep 17 00:00:00 2001
From: Sokolov Yura <funny.fal...@postgrespro.ru>
Date: Tue, 30 May 2017 17:21:48 +0300
Subject: [PATCH 3/6] Use a custom loop for LWLockWaitForVar.

Previously LWLockWaitForVar used the following procedure:
- check the lock state and variable value with LWLockConflictsWithVar
- LWLockQueueSelf
- recheck the lock state and variable with LWLockConflictsWithVar
- LWLockDequeueSelf if the lock was released by its holder or the
  variable changed.

With this change, LWLockConflictsWithVar performs everything in a single
atomic sequence:
- check the lock state
  - if the lock is not held, return "no need to wait"
  - otherwise try to lock the wait list
- with the wait list locked, check the value of the variable (return if
  it changed)
- set the flags on the state and queue ourselves into the wait list
  - if the CAS for the flags failed, recheck whether the lock is still
    held; if it is not held, return.

With this change there is no need for LWLockQueueSelf and
LWLockDequeueSelf any more, so these functions are removed.
---
 src/backend/storage/lmgr/lwlock.c | 251 +++++++++++---------------------------
 1 file changed, 70 insertions(+), 181 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index c73f7430c9..487824ea9f 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -176,7 +176,6 @@ typedef struct lwlock_stats
 	int			sh_acquire_count;
 	int			ex_acquire_count;
 	int			block_count;
-	int			dequeue_self_count;
 	int			spin_delay_count;
 }			lwlock_stats;
 
@@ -284,11 +283,11 @@ print_lwlock_stats(int code, Datum arg)
 	while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL)
 	{
 		fprintf(stderr,
-				"PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n",
+				"PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u\n",
 				MyProcPid, LWLockTrancheArray[lwstats->key.tranche],
 				lwstats->key.instance, lwstats->sh_acquire_count,
 				lwstats->ex_acquire_count, lwstats->block_count,
-				lwstats->spin_delay_count, lwstats->dequeue_self_count);
+				lwstats->spin_delay_count);
 	}
 
 	LWLockRelease(&MainLWLockArray[0].lock);
@@ -318,7 +317,6 @@ get_lwlock_stats_entry(LWLock *lock)
 		lwstats->sh_acquire_count = 0;
 		lwstats->ex_acquire_count = 0;
 		lwstats->block_count = 0;
-		lwstats->dequeue_self_count = 0;
 		lwstats->spin_delay_count = 0;
 	}
 	return lwstats;
@@ -1053,33 +1051,6 @@ LWLockWakeup(LWLock *lock)
 }
 
 /*
- * Add ourselves to the end of the queue, acquiring the wait list lock.
- *
- * NB: Mode can be LW_WAIT_UNTIL_FREE here!
- */
-static void
-LWLockQueueSelf(LWLock *lock, LWLockMode mode)
-{
-	/*
-	 * If we don't have a PGPROC structure, there's no way to wait. This
-	 * should never occur, since MyProc should only be null during shared
-	 * memory initialization.
-	 */
-	if (MyProc == NULL)
-		elog(PANIC, "cannot wait without a PGPROC structure");
-
-	if (MyProc->lwWaiting)
-		elog(PANIC, "queueing for lock while waiting on another one");
-
-	LWLockWaitListLock(lock);
-
-	/* setting the flag is protected by the spinlock */
-	pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
-
-	LWLockQueueSelfLocked(lock, mode);
-}
-
-/*
  * Add ourselves to the end of the queue.
  * The wait list lock must be held at this point, and all necessary flags
  * must already be set. This function releases the wait list lock.
@@ -1108,100 +1079,6 @@ LWLockQueueSelfLocked(LWLock *lock, LWLockMode mode)
 }
 
 /*
- * Remove ourselves from the waitlist.
- *
- * This is used if we queued ourselves because we thought we needed to sleep
- * but, after further checking, we discovered that we don't actually need to
- * do so.
- */
-static void
-LWLockDequeueSelf(LWLock *lock)
-{
-	bool		found = false;
-	proclist_mutable_iter iter;
-
-#ifdef LWLOCK_STATS
-	lwlock_stats *lwstats;
-
-	lwstats = get_lwlock_stats_entry(lock);
-
-	lwstats->dequeue_self_count++;
-#endif
-
-	LWLockWaitListLock(lock);
-
-	/*
-	 * Can't just remove ourselves from the list, but we need to iterate over
-	 * all entries as somebody else could have dequeued us.
-	 */
-	proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
-	{
-		if (iter.cur == MyProc->pgprocno)
-		{
-			found = true;
-			proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
-			break;
-		}
-	}
-
-	if (proclist_is_empty(&lock->waiters) &&
-		(pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
-	{
-		pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
-	}
-
-	/* XXX: combine with fetch_and above? */
-	LWLockWaitListUnlock(lock);
-
-	/* clear waiting state again, nice for debugging */
-	if (found)
-		MyProc->lwWaiting = false;
-	else
-	{
-		int			extraWaits = 0;
-
-		/*
-		 * Somebody else dequeued us and has or will wake us up. Deal with the
-		 * superfluous absorption of a wakeup.
-		 */
-
-		/*
-		 * Reset releaseOk if somebody woke us before we removed ourselves -
-		 * they'll have set it to false.
-		 */
-		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
-
-		/*
-		 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
-		 * get reset at some inconvenient point later. Most of the time this
-		 * will immediately return.
-		 */
-		for (;;)
-		{
-			PGSemaphoreLock(MyProc->sem);
-			if (!MyProc->lwWaiting)
-				break;
-			extraWaits++;
-		}
-
-		/*
-		 * Fix the process wait semaphore's count for any absorbed wakeups.
-		 */
-		while (extraWaits-- > 0)
-			PGSemaphoreUnlock(MyProc->sem);
-	}
-
-#ifdef LOCK_DEBUG
-	{
-		/* not waiting anymore */
-		uint32		nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
-
-		Assert(nwaiters < MAX_BACKENDS);
-	}
-#endif
-}
-
-/*
  * LWLockAcquire - acquire a lightweight lock in the specified mode
  *
  * If the lock is not available, sleep until it is.  Returns true if the lock
@@ -1514,43 +1391,87 @@ LWLockConflictsWithVar(LWLock *lock,
 {
 	bool		mustwait;
 	uint64		value;
+	uint32		state;
+	SpinDelayStatus delayStatus;
 
-	/*
-	 * Test first to see if it the slot is free right now.
-	 *
-	 * XXX: the caller uses a spinlock before this, so we don't need a memory
-	 * barrier here as far as the current usage is concerned.  But that might
-	 * not be safe in general.
-	 */
-	mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0;
-
-	if (!mustwait)
-	{
-		*result = true;
-		return false;
-	}
-
-	*result = false;
+	init_local_spin_delay(&delayStatus);
 
 	/*
+	 * We are trying to detect an exclusive lock on the state, or a change of
+	 * the value. If neither happens, we put ourselves into the wait queue.
+	 *
 	 * Read value using the lwlock's wait list lock, as we can't generally
 	 * rely on atomic 64 bit reads/stores.  TODO: On platforms with a way to
 	 * do atomic 64 bit reads/writes the spinlock should be optimized away.
 	 */
-	LWLockWaitListLock(lock);
-	value = *valptr;
-	LWLockWaitListUnlock(lock);
+	while (true)
+	{
+		state = pg_atomic_read_u32(&lock->state);
+		mustwait = (state & LW_VAL_EXCLUSIVE) != 0;
 
-	if (value != oldval)
+		if (!mustwait)
+		{
+			*result = true;
+			goto exit;
+		}
+
+		if ((state & LW_FLAG_LOCKED) == 0)
+		{
+			if (pg_atomic_compare_exchange_u32(&lock->state, &state,
+											   state | LW_FLAG_LOCKED))
+				break;
+		}
+		perform_spin_delay(&delayStatus);
+	}
+
+	/* Now we've locked the wait queue, and someone holds the EXCLUSIVE lock. */
+	value = *valptr;
+	mustwait = value == oldval;
+	if (!mustwait)
 	{
-		mustwait = false;
+		/* value changed, no need to block */
+		LWLockWaitListUnlock(lock);
 		*newval = value;
+		goto exit;
 	}
-	else
+
+	/*
+	 * If the value didn't change, we have to set the LW_FLAG_HAS_WAITERS and
+	 * LW_FLAG_RELEASE_OK flags and put ourselves into the wait queue. We
+	 * could find that the lock is not held anymore; in that case we should
+	 * return without blocking.
+	 *
+	 * Note: the value cannot change again because we are holding the wait
+	 * list lock.
+	 */
+	while (true)
 	{
-		mustwait = true;
-	}
+		uint32		desired_state;
 
+		desired_state = state | LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK;
+		if (pg_atomic_compare_exchange_u32(&lock->state, &state,
+										   desired_state))
+		{
+			/*
+			 * We satisfy the invariant: the flags are only set while someone
+			 * still holds the lock.
+			 */
+			LWLockQueueSelfLocked(lock, LW_WAIT_UNTIL_FREE);
+			break;
+		}
+		if ((state & LW_VAL_EXCLUSIVE) == 0)
+		{
+			/* lock was released, no need to block anymore */
+			LWLockWaitListUnlock(lock);
+			*result = true;
+			mustwait = false;
+			break;
+		}
+	}
+exit:
+#ifdef LWLOCK_STATS
+	add_lwlock_stats_spin_stat(lock, &delayStatus);
+#endif
+	finish_spin_delay(&delayStatus);
 	return mustwait;
 }
 
@@ -1602,38 +1523,6 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 			break;				/* the lock was free or value didn't match */
 
 		/*
-		 * Add myself to wait queue. Note that this is racy, somebody else
-		 * could wakeup before we're finished queuing. NB: We're using nearly
-		 * the same twice-in-a-row lock acquisition protocol as
-		 * LWLockAcquire(). Check its comments for details. The only
-		 * difference is that we also have to check the variable's values when
-		 * checking the state of the lock.
-		 */
-		LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
-
-		/*
-		 * Set RELEASE_OK flag, to make sure we get woken up as soon as the
-		 * lock is released.
-		 */
-		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
-
-		/*
-		 * We're now guaranteed to be woken up if necessary. Recheck the lock
-		 * and variables state.
-		 */
-		mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
-										  &result);
-
-		/* Ok, no conflict after we queued ourselves. Undo queueing. */
-		if (!mustwait)
-		{
-			LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue");
-
-			LWLockDequeueSelf(lock);
-			break;
-		}
-
-		/*
 		 * Wait until awakened.
 		 *
 		 * Since we share the process wait semaphore with the regular lock
-- 
2.11.0


From c6ea4c993b2e6951c724abc7a9d3094d942adbcb Mon Sep 17 00:00:00 2001
From: Sokolov Yura <funny.fal...@postgrespro.ru>
Date: Wed, 31 May 2017 13:56:30 +0000
Subject: [PATCH 4/6] Make LWLock acquisition look more like a spinlock.

If delayStatus.spins <= spins_per_delay/8, do not try to add ourselves
to the wait list; instead call perform_spin_delay and retry the lock
acquisition. Since delayStatus.spins is reset after each pg_usleep, we
spin again after sleeping.

This greatly improves performance in "contended but not much" cases,
and overall performance as well.
---
 src/backend/storage/lmgr/lwlock.c | 10 ++++++++--
 src/backend/storage/lmgr/s_lock.c |  2 +-
 src/include/storage/s_lock.h      |  2 ++
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 487824ea9f..50e0d53ec6 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -814,6 +814,7 @@ LWLockAttemptLockOrQueueSelf(LWLock *lock, LWLockMode mode, LWLockMode waitmode)
 	bool		lock_free;
 	uint32		mask,
 				add;
+	int			skip_wait_list;
 
 	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
 
@@ -821,11 +822,13 @@ LWLockAttemptLockOrQueueSelf(LWLock *lock, LWLockMode mode, LWLockMode waitmode)
 	{
 		mask = LW_LOCK_MASK;
 		add = LW_VAL_EXCLUSIVE;
+		skip_wait_list = spins_per_delay / 8;
 	}
 	else
 	{
 		mask = LW_VAL_EXCLUSIVE;
 		add = LW_VAL_SHARED;
+		skip_wait_list = spins_per_delay / 4;
 	}
 
 	init_local_spin_delay(&delayStatus);
@@ -857,7 +860,7 @@ LWLockAttemptLockOrQueueSelf(LWLock *lock, LWLockMode mode, LWLockMode waitmode)
 				break;
 			}
 		}
-		else if ((old_state & LW_FLAG_LOCKED) == 0)
+		else if (delayStatus.spins > skip_wait_list && (old_state & LW_FLAG_LOCKED) == 0)
 		{
 			desired_state |= LW_FLAG_LOCKED | LW_FLAG_HAS_WAITERS;
 			if (pg_atomic_compare_exchange_u32(&lock->state,
@@ -1393,9 +1396,12 @@ LWLockConflictsWithVar(LWLock *lock,
 	uint64		value;
 	uint32		state;
 	SpinDelayStatus delayStatus;
+	int			skip_wait_list;
 
 	init_local_spin_delay(&delayStatus);
 
+	skip_wait_list = spins_per_delay / 4;
+
 	/*
 	 * We are trying to detect exclusive lock on state, or value change. If
 	 * neither happens, we put ourself into wait queue.
@@ -1415,7 +1421,7 @@ LWLockConflictsWithVar(LWLock *lock,
 			goto exit;
 		}
 
-		if ((state & LW_FLAG_LOCKED) == 0)
+		if (delayStatus.spins > skip_wait_list && (state & LW_FLAG_LOCKED) == 0)
 		{
 			if (pg_atomic_compare_exchange_u32(&lock->state, &state,
 											   state | LW_FLAG_LOCKED))
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index f3e0fbc602..ce1f08656c 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -63,7 +63,7 @@
 
 slock_t		dummy_spinlock;
 
-static int	spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+int			spins_per_delay = DEFAULT_SPINS_PER_DELAY;
 
 
 /*
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index bbf505e246..1dd4c7880d 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -964,6 +964,8 @@ extern int s_lock(volatile slock_t *lock, const char *file, int line, const char
 
 /* Support for dynamic adjustment of spins_per_delay */
 #define DEFAULT_SPINS_PER_DELAY  100
+/* Read in lwlock.c to adjust behavior */
+extern int spins_per_delay;
 
 extern void set_spins_per_delay(int shared_spins_per_delay);
 extern int	update_spins_per_delay(int shared_spins_per_delay);
-- 
2.11.0
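
To make patch 4's decision rule concrete, here is a hedged sketch of it
as a standalone predicate (the function name is mine; the constants,
fields and the 1/8 vs 1/4 split are as in the diff above, and the split
is a heuristic rather than a tuned universal value):

static inline bool
should_try_queueing_sketch(const SpinDelayStatus *delay, uint32 old_state,
						   LWLockMode mode)
{
	int			skip_wait_list;

	skip_wait_list = (mode == LW_EXCLUSIVE) ? spins_per_delay / 8
											: spins_per_delay / 4;

	/*
	 * Queue only after spinning past the threshold, and only if nobody
	 * holds the wait list lock.  Since delay->spins resets after each
	 * pg_usleep, we spin again after every sleep, like a spinlock.
	 */
	return delay->spins > skip_wait_list &&
		(old_state & LW_FLAG_LOCKED) == 0;
}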


From 877001bc013e8426b3cfdc9c28434f8c9a99b1da Mon Sep 17 00:00:00 2001
From: Sokolov Yura <funny.fal...@postgrespro.ru>
Date: Tue, 30 May 2017 18:54:25 +0300
Subject: [PATCH 5/6] Fix the implementation description in lwlock.c.

---
 src/backend/storage/lmgr/lwlock.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 50e0d53ec6..d38d031d90 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -62,16 +62,15 @@
  * notice that we have to wait. Unfortunately by the time we have finished
  * queuing, the former locker very well might have already finished it's
  * work. That's problematic because we're now stuck waiting inside the OS.
-
- * To mitigate those races we use a two phased attempt at locking:
- *	 Phase 1: Try to do it atomically, if we succeed, nice
- *	 Phase 2: Add ourselves to the waitqueue of the lock
- *	 Phase 3: Try to grab the lock again, if we succeed, remove ourselves from
- *			  the queue
- *	 Phase 4: Sleep till wake-up, goto Phase 1
  *
- * This protects us against the problem from above as nobody can release too
- *	  quick, before we're queued, since after Phase 2 we're already queued.
+ * This race is avoided by taking the wait list lock with a CAS against the
+ * old state value, so it can succeed only if the lock is still held. The
+ * necessary flag is set by the same CAS.
+ *
+ * Inside LWLockConflictsWithVar the wait list lock is reused to protect the
+ * variable's value. It is first acquired to check the variable's value; the
+ * flags are then set with a second CAS before queueing into the wait list,
+ * to ensure the lock was not released in the meantime.
  * -------------------------------------------------------------------------
  */
 #include "postgres.h"
-- 
2.11.0


From 1dacce7f45edc4f234266665ba3ce88a28588fda Mon Sep 17 00:00:00 2001
From: Sokolov Yura <funny.fal...@postgrespro.ru>
Date: Fri, 2 Jun 2017 11:34:23 +0000
Subject: [PATCH 6/6] Make SpinDelayStatus a bit lighter.

This saves a couple of instructions on the fast path of the spin loop,
and so makes the fast path of LWLock a bit faster (as well as the other
places where spinlocks are used).
It also makes the call to perform_spin_delay a bit slower, which could
positively affect spin behavior, especially when no `pause` instruction
is present.
---
 src/backend/storage/lmgr/s_lock.c |  9 +++++----
 src/include/storage/s_lock.h      | 16 ++++++----------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index ce1f08656c..11c0720951 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -93,11 +93,11 @@ s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
 {
 	SpinDelayStatus delayStatus;
 
-	init_spin_delay(&delayStatus, file, line, func);
+	init_spin_delay(&delayStatus);
 
 	while (TAS_SPIN(lock))
 	{
-		perform_spin_delay(&delayStatus);
+		perform_spin_delay_with_info(&delayStatus, file, line, func);
 	}
 
 	finish_spin_delay(&delayStatus);
@@ -122,7 +122,8 @@ s_unlock(volatile slock_t *lock)
  * Wait while spinning on a contended spinlock.
  */
 void
-perform_spin_delay(SpinDelayStatus *status)
+perform_spin_delay_with_info(SpinDelayStatus *status,
+							 const char *file, int line, const char *func)
 {
 	/* CPU-specific delay each time through the loop */
 	SPIN_DELAY();
@@ -131,7 +132,7 @@ perform_spin_delay(SpinDelayStatus *status)
 	if (++(status->spins) >= spins_per_delay)
 	{
 		if (++(status->delays) > NUM_DELAYS)
-			s_lock_stuck(status->file, status->line, status->func);
+			s_lock_stuck(file, line, func);
 
 		if (status->cur_delay == 0) /* first time to delay? */
 			status->cur_delay = MIN_DELAY_USEC;
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 1dd4c7880d..e5c4645461 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -979,25 +979,21 @@ typedef struct
 	int			spins;
 	int			delays;
 	int			cur_delay;
-	const char *file;
-	int			line;
-	const char *func;
 } SpinDelayStatus;
 
 static inline void
-init_spin_delay(SpinDelayStatus *status,
-				const char *file, int line, const char *func)
+init_spin_delay(SpinDelayStatus *status)
 {
 	status->spins = 0;
 	status->delays = 0;
 	status->cur_delay = 0;
-	status->file = file;
-	status->line = line;
-	status->func = func;
 }
 
-#define init_local_spin_delay(status) init_spin_delay(status, __FILE__, __LINE__, PG_FUNCNAME_MACRO)
-void perform_spin_delay(SpinDelayStatus *status);
+#define init_local_spin_delay(status) init_spin_delay(status)
+void perform_spin_delay_with_info(SpinDelayStatus *status,
+		const char *file, int line, const char *func);
+#define perform_spin_delay(status) \
+	perform_spin_delay_with_info(status, __FILE__, __LINE__, PG_FUNCNAME_MACRO)
 void finish_spin_delay(SpinDelayStatus *status);
 
 #endif	 /* S_LOCK_H */
-- 
2.11.0
