On 2014-10-10 16:41:39 +0200, Andres Freund wrote:
> FWIW, the profile always looks like
> -  48.61%      postgres  postgres              [.] s_lock
>    - s_lock
>       + 96.67% StrategyGetBuffer
>       + 1.19% UnpinBuffer
>       + 0.90% PinBuffer
>       + 0.70% hash_search_with_hash_value
> +   3.11%      postgres  postgres              [.] GetSnapshotData
> +   2.47%      postgres  postgres              [.] StrategyGetBuffer
> +   1.93%      postgres  [kernel.kallsyms]     [k] copy_user_generic_string
> +   1.28%      postgres  postgres              [.] hash_search_with_hash_value
> -   1.27%      postgres  postgres              [.] LWLockAttemptLock
>    - LWLockAttemptLock
>       - 97.78% LWLockAcquire
>          + 38.76% ReadBuffer_common
>          + 28.62% _bt_getbuf
>          + 8.59% _bt_relandgetbuf
>          + 6.25% GetSnapshotData
>          + 5.93% VirtualXactLockTableInsert
>          + 3.95% VirtualXactLockTableCleanup
>          + 2.35% index_fetch_heap
>          + 1.66% StartBufferIO
>          + 1.56% LockReleaseAll
>          + 1.55% _bt_next
>          + 0.78% LockAcquireExtended
>       + 1.47% _bt_next
>       + 0.75% _bt_relandgetbuf
> 
> to me. Now that's with the client count 496, but it's similar with lower
> counts.
> 
> BTW, that profile *clearly* indicates we should make StrategyGetBuffer()
> smarter.

Which is nearly trivial now that atomics are in. Check out the attached
WIP patch which eliminates the spinlock from StrategyGetBuffer() unless
there's buffers on the freelist.

Test:
pgbench  -M prepared -P 5 -S -c 496 -j 496 -T 5000
on a scale=1000 database, with 4GB of shared buffers.

Before:
progress: 40.0 s, 136252.3 tps, lat 3.628 ms stddev 4.547
progress: 45.0 s, 135049.0 tps, lat 3.660 ms stddev 4.515
progress: 50.0 s, 135788.9 tps, lat 3.640 ms stddev 4.398
progress: 55.0 s, 135268.4 tps, lat 3.654 ms stddev 4.469
progress: 60.0 s, 134991.6 tps, lat 3.661 ms stddev 4.739

after:
progress: 40.0 s, 207701.1 tps, lat 2.382 ms stddev 3.018
progress: 45.0 s, 208022.4 tps, lat 2.377 ms stddev 2.902
progress: 50.0 s, 209187.1 tps, lat 2.364 ms stddev 2.970
progress: 55.0 s, 206462.7 tps, lat 2.396 ms stddev 2.871
progress: 60.0 s, 210263.8 tps, lat 2.351 ms stddev 2.914

Yes, no kidding.

The results are similar, but less extreme, for smaller client counts
like 80 or 160.

Amit, since your test seems to be currently completely bottlenecked
within StrategyGetBuffer(), could you compare with that patch applied to
HEAD and the LW_SHARED patch for one client count? That may allow us
to see a more meaningful profile...

Greetings,

Andres Freund

-- 
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
>From 6b486e5b467e94ab9297d7656a5b39b816c5c55a Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Fri, 10 Oct 2014 17:36:46 +0200
Subject: [PATCH] WIP: lockless StrategyGetBuffer hotpath

---
 src/backend/storage/buffer/freelist.c | 154 ++++++++++++++++++++--------------
 1 file changed, 90 insertions(+), 64 deletions(-)

diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 5966beb..0c634a0 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -18,6 +18,12 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 
+#include "port/atomics.h"
+
+
+#define LATCHPTR_ACCESS_ONCE(var)	((Latch *)(*((volatile Latch **)&(var))))
+#define INT_ACCESS_ONCE(var)	((int)(*((volatile int *)&(var))))
+
 
 /*
  * The shared freelist control information.
@@ -27,8 +33,12 @@ typedef struct
 	/* Spinlock: protects the values below */
 	slock_t		buffer_strategy_lock;
 
-	/* Clock sweep hand: index of next buffer to consider grabbing */
-	int			nextVictimBuffer;
+	/*
+	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
+	 * this isn't a concrete buffer - we only ever increase the value. So, to
+	 * get an actual buffer, it needs to be used modulo NBuffers.
+	 */
+	pg_atomic_uint32 nextVictimBuffer;
 
 	int			firstFreeBuffer;	/* Head of list of unused buffers */
 	int			lastFreeBuffer; /* Tail of list of unused buffers */
@@ -42,8 +52,8 @@ typedef struct
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
 	 */
-	uint32		completePasses; /* Complete cycles of the clock sweep */
-	uint32		numBufferAllocs;	/* Buffers allocated since last reset */
+	pg_atomic_uint32 completePasses; /* Complete cycles of the clock sweep */
+	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */
 
 	/*
 	 * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
@@ -124,87 +134,107 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			return buf;
 	}
 
-	/* Nope, so lock the freelist */
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
 	/*
 	 * We count buffer allocation requests so that the bgwriter can estimate
 	 * the rate of buffer consumption.  Note that buffers recycled by a
 	 * strategy object are intentionally not counted here.
 	 */
-	StrategyControl->numBufferAllocs++;
+	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
 
 	/*
-	 * If bgwriterLatch is set, we need to waken the bgwriter, but we should
-	 * not do so while holding buffer_strategy_lock; so release and re-grab.
-	 * This is annoyingly tedious, but it happens at most once per bgwriter
-	 * cycle, so the performance hit is minimal.
+	 * If bgwriterLatch is set, we need to waken the bgwriter. We don't want
+	 * to check this while holding the spinlock, so we read the latch from memory
+	 * once to see whether it's set. There's no need to do so with a lock
+	 * present - we'll just set the latch next call if we missed it once.
+	 *
+	 * Since we're not guaranteed atomic 8 byte reads we need to acquire the
+	 * spinlock if not null to be sure we get a correct pointer. Because we
+	 * don't want to set the latch while holding the buffer_strategy_lock we
+	 * just grab the lock to read and reset the pointer.
 	 */
-	bgwriterLatch = StrategyControl->bgwriterLatch;
+	bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch);
 	if (bgwriterLatch)
 	{
+		/* we don't have guaranteed atomic 64bit reads */
+		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+		bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch);
 		StrategyControl->bgwriterLatch = NULL;
 		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-		SetLatch(bgwriterLatch);
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+		/* recheck */
+		if (bgwriterLatch)
+			SetLatch(bgwriterLatch);
 	}
 
 	/*
-	 * Try to get a buffer from the freelist.  Note that the freeNext fields
-	 * are considered to be protected by the buffer_strategy_lock not the
-	 * individual buffer spinlocks, so it's OK to manipulate them without
-	 * holding the spinlock.
+	 * First check, without acquiring the lock, whether there's buffers in the
+	 * freelist. Since we otherwise don't require the spinlock in every
+	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
+	 * uselessly in most cases.
+	 *
+	 * If there's buffers on the freelist, acquire the spinlock and look into
+	 * them.
+	 *
+	 * Note that the freeNext fields are considered to be protected by
+	 * the buffer_strategy_lock not the individual buffer spinlocks, so it's
+	 * OK to manipulate them without holding the spinlock.
 	 */
-	while (StrategyControl->firstFreeBuffer >= 0)
+	if (INT_ACCESS_ONCE(StrategyControl->firstFreeBuffer) >= 0)
 	{
-		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
-		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
-		/* Unconditionally remove buffer from freelist */
-		StrategyControl->firstFreeBuffer = buf->freeNext;
-		buf->freeNext = FREENEXT_NOT_IN_LIST;
+		while (StrategyControl->firstFreeBuffer >= 0)
+		{
+			buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
 
-		/*
-		 * Release the lock so someone else can access the freelist (or run
-		 * the clocksweep) while we check out this buffer.
-		 */
-		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+			/* Unconditionally remove buffer from freelist */
+			StrategyControl->firstFreeBuffer = buf->freeNext;
+			buf->freeNext = FREENEXT_NOT_IN_LIST;
 
-		/*
-		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
-		 * it; discard it and retry.  (This can only happen if VACUUM put a
-		 * valid buffer in the freelist and then someone else used it before
-		 * we got to it.  It's probably impossible altogether as of 8.3, but
-		 * we'd better check anyway.)
-		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0 && buf->usage_count == 0)
-		{
-			if (strategy != NULL)
-				AddBufferToRing(strategy, buf);
-			return buf;
-		}
-		UnlockBufHdr(buf);
+			/*
+			 * Release the lock so someone else can access the freelist (or run
+			 * the clocksweep) while we check out this buffer.
+			 */
+			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 
-		/* Reacquire the lock and go around for another pass. */
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+			/*
+			 * If the buffer is pinned or has a nonzero usage_count, we cannot use
+			 * it; discard it and retry.  (This can only happen if VACUUM put a
+			 * valid buffer in the freelist and then someone else used it before
+			 * we got to it.  It's probably impossible altogether as of 8.3, but
+			 * we'd better check anyway.)
+			 */
+			LockBufHdr(buf);
+			if (buf->refcount == 0 && buf->usage_count == 0)
+			{
+				if (strategy != NULL)
+					AddBufferToRing(strategy, buf);
+				return buf;
+			}
+			UnlockBufHdr(buf);
+
+			/* Reacquire the lock and go around for another pass. */
+			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+		}
+		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 	}
 
 	/* Nothing on the freelist, so run the "clock sweep" algorithm */
 	trycounter = NBuffers;
 	for (;;)
 	{
-		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
+		int victim;
+
+		victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
 
-		if (++StrategyControl->nextVictimBuffer >= NBuffers)
+		buf = &BufferDescriptors[victim % NBuffers];
+
+		if (victim % NBuffers == 0)
 		{
-			StrategyControl->nextVictimBuffer = 0;
-			StrategyControl->completePasses++;
+			pg_atomic_add_fetch_u32(&StrategyControl->completePasses, 1);
 		}
 
-		/* Release the lock before manipulating the candidate buffer. */
-		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-
 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
@@ -238,9 +268,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			elog(ERROR, "no unpinned buffers available");
 		}
 		UnlockBufHdr(buf);
-
-		/* Reacquire the lock and get a new candidate buffer. */
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 	}
 }
 
@@ -284,13 +311,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 	int			result;
 
 	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-	result = StrategyControl->nextVictimBuffer;
+	result = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer) % NBuffers;
 	if (complete_passes)
-		*complete_passes = StrategyControl->completePasses;
+		*complete_passes = pg_atomic_read_u32(&StrategyControl->completePasses);
 	if (num_buf_alloc)
 	{
-		*num_buf_alloc = StrategyControl->numBufferAllocs;
-		StrategyControl->numBufferAllocs = 0;
+		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
 	}
 	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 	return result;
@@ -389,11 +415,11 @@ StrategyInitialize(bool init)
 		StrategyControl->lastFreeBuffer = NBuffers - 1;
 
 		/* Initialize the clock sweep pointer */
-		StrategyControl->nextVictimBuffer = 0;
+		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
 
 		/* Clear statistics */
-		StrategyControl->completePasses = 0;
-		StrategyControl->numBufferAllocs = 0;
+		pg_atomic_init_u32(&StrategyControl->completePasses, 0);
+		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
 
 		/* No pending notification */
 		StrategyControl->bgwriterLatch = NULL;
-- 
1.8.3.251.g1462b67

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to