On 2014-10-10 16:41:39 +0200, Andres Freund wrote:
> FWIW, the profile always looks like
> -  48.61%  postgres  postgres              [.] s_lock
>    - s_lock
>       + 96.67% StrategyGetBuffer
>       + 1.19% UnpinBuffer
>       + 0.90% PinBuffer
>       + 0.70% hash_search_with_hash_value
> +   3.11%  postgres  postgres              [.] GetSnapshotData
> +   2.47%  postgres  postgres              [.] StrategyGetBuffer
> +   1.93%  postgres  [kernel.kallsyms]     [k] copy_user_generic_string
> +   1.28%  postgres  postgres              [.] hash_search_with_hash_value
> -   1.27%  postgres  postgres              [.] LWLockAttemptLock
>    - LWLockAttemptLock
>       - 97.78% LWLockAcquire
>          + 38.76% ReadBuffer_common
>          + 28.62% _bt_getbuf
>          + 8.59% _bt_relandgetbuf
>          + 6.25% GetSnapshotData
>          + 5.93% VirtualXactLockTableInsert
>          + 3.95% VirtualXactLockTableCleanup
>          + 2.35% index_fetch_heap
>          + 1.66% StartBufferIO
>          + 1.56% LockReleaseAll
>          + 1.55% _bt_next
>          + 0.78% LockAcquireExtended
>       + 1.47% _bt_next
>       + 0.75% _bt_relandgetbuf
>
> to me. Now that's with the client count 496, but it's similar with lower
> counts.
>
> BTW, that profile *clearly* indicates we should make StrategyGetBuffer()
> smarter.
Which is nearly trivial now that atomics are in. Check out the attached WIP
patch, which eliminates the spinlock from StrategyGetBuffer() unless there
are buffers on the freelist.

Test: pgbench -M prepared -P 5 -S -c 496 -j 496 -T 5000 on a scale=1000
database, with 4GB of shared buffers.

Before:
progress: 40.0 s, 136252.3 tps, lat 3.628 ms stddev 4.547
progress: 45.0 s, 135049.0 tps, lat 3.660 ms stddev 4.515
progress: 50.0 s, 135788.9 tps, lat 3.640 ms stddev 4.398
progress: 55.0 s, 135268.4 tps, lat 3.654 ms stddev 4.469
progress: 60.0 s, 134991.6 tps, lat 3.661 ms stddev 4.739

after:
progress: 40.0 s, 207701.1 tps, lat 2.382 ms stddev 3.018
progress: 45.0 s, 208022.4 tps, lat 2.377 ms stddev 2.902
progress: 50.0 s, 209187.1 tps, lat 2.364 ms stddev 2.970
progress: 55.0 s, 206462.7 tps, lat 2.396 ms stddev 2.871
progress: 60.0 s, 210263.8 tps, lat 2.351 ms stddev 2.914

Yes, no kidding. The results are similar, but less extreme, for smaller
client counts like 80 or 160.

Amit, since your test currently seems to be completely bottlenecked within
StrategyGetBuffer(), could you compare with that patch applied to both HEAD
and the LW_SHARED patch, for one client count? That may allow us to see a
more meaningful profile...

Greetings,

Andres Freund

-- 
 Andres Freund	                   http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
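The core of the attached patch is that the clock-sweep hand stops being an
index guarded by buffer_strategy_lock and becomes a monotonically increasing
atomic counter that is mapped onto the buffer array with a modulo. Below is a
minimal standalone sketch of that idea only, not the patch itself: C11
atomics stand in for PostgreSQL's pg_atomic_* API, and NBUFFERS and the
helper names are invented for illustration.

/*
 * Sketch of a lockless clock sweep: the hand is only ever incremented,
 * and wraparound is handled via modulo rather than by resetting it.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NBUFFERS 16

static atomic_uint next_victim;      /* only ever incremented, never reset */
static atomic_uint complete_passes;  /* bumped whenever the hand wraps */

static unsigned int
clock_sweep_next(void)
{
    /* advance the hand without taking any lock */
    unsigned int victim = atomic_fetch_add(&next_victim, 1);

    /* as in the WIP patch, a wrap is detected via the modulo */
    if (victim % NBUFFERS == 0)
        atomic_fetch_add(&complete_passes, 1);

    return victim % NBUFFERS;        /* index into the buffer array */
}

int
main(void)
{
    for (int i = 0; i < 40; i++)
        printf("victim buffer: %u\n", clock_sweep_next());
    printf("complete passes: %u\n", atomic_load(&complete_passes));
    return 0;
}

Since the counter never runs backwards, concurrent callers each get a
distinct victim index without contending on a spinlock; readers that need a
buffer number (such as the bgwriter's sync-start logic) just apply the same
modulo.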
>From 6b486e5b467e94ab9297d7656a5b39b816c5c55a Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Fri, 10 Oct 2014 17:36:46 +0200
Subject: [PATCH] WIP: lockless StrategyGetBuffer hotpath

---
 src/backend/storage/buffer/freelist.c | 154 ++++++++++++++++++++--------------
 1 file changed, 90 insertions(+), 64 deletions(-)

diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 5966beb..0c634a0 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -18,6 +18,12 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 
+#include "port/atomics.h"
+
+
+#define LATCHPTR_ACCESS_ONCE(var)	((Latch *)(*((volatile Latch **)&(var))))
+#define INT_ACCESS_ONCE(var)	((int)(*((volatile int *)&(var))))
+
 
 /*
  * The shared freelist control information.
@@ -27,8 +33,12 @@ typedef struct
 	/* Spinlock: protects the values below */
 	slock_t		buffer_strategy_lock;
 
-	/* Clock sweep hand: index of next buffer to consider grabbing */
-	int			nextVictimBuffer;
+	/*
+	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
+	 * this isn't a concrete buffer - we only ever increase the value. So, to
+	 * get an actual buffer, it needs to be used modulo NBuffers.
+	 */
+	pg_atomic_uint32 nextVictimBuffer;
 
 	int			firstFreeBuffer;	/* Head of list of unused buffers */
 	int			lastFreeBuffer; /* Tail of list of unused buffers */
@@ -42,8 +52,8 @@ typedef struct
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
 	 */
-	uint32		completePasses; /* Complete cycles of the clock sweep */
-	uint32		numBufferAllocs;	/* Buffers allocated since last reset */
+	pg_atomic_uint32 completePasses; /* Complete cycles of the clock sweep */
+	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */
 
 	/*
 	 * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
@@ -124,87 +134,107 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			return buf;
 	}
 
-	/* Nope, so lock the freelist */
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
 	/*
 	 * We count buffer allocation requests so that the bgwriter can estimate
 	 * the rate of buffer consumption.  Note that buffers recycled by a
 	 * strategy object are intentionally not counted here.
 	 */
-	StrategyControl->numBufferAllocs++;
+	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
 
 	/*
-	 * If bgwriterLatch is set, we need to waken the bgwriter, but we should
-	 * not do so while holding buffer_strategy_lock; so release and re-grab.
-	 * This is annoyingly tedious, but it happens at most once per bgwriter
-	 * cycle, so the performance hit is minimal.
+	 * If bgwriterLatch is set, we need to waken the bgwriter. We don't want
+	 * to check this while holding the spinlock, so we read the latch from
+	 * memory once to see whether it's set. There's no need to do so with a
+	 * lock present - we'll just set the latch next call if we missed it once.
+	 *
+	 * Since we're not guaranteed atomic 8 byte reads we need to acquire the
+	 * spinlock if not null to be sure we get a correct pointer. Because we
+	 * don't want to set the latch while holding the buffer_strategy_lock we
+	 * just grab the lock to read and reset the pointer.
 	 */
-	bgwriterLatch = StrategyControl->bgwriterLatch;
+	bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch);
 	if (bgwriterLatch)
 	{
+		/* we don't have guaranteed atomic 64bit reads */
+		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+		bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch);
 		StrategyControl->bgwriterLatch = NULL;
 		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-		SetLatch(bgwriterLatch);
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+		/* recheck */
+		if (bgwriterLatch)
+			SetLatch(bgwriterLatch);
 	}
 
 	/*
-	 * Try to get a buffer from the freelist.  Note that the freeNext fields
-	 * are considered to be protected by the buffer_strategy_lock not the
-	 * individual buffer spinlocks, so it's OK to manipulate them without
-	 * holding the spinlock.
+	 * First check, without acquiring the lock, whether there's buffers in the
+	 * freelist. Since we otherwise don't require the spinlock in every
+	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
+	 * uselessly in most cases.
+	 *
+	 * If there's buffers on the freelist, acquire the spinlock and look into
+	 * them.
+	 *
+	 * Note that the freeNext fields are considered to be protected by
+	 * the buffer_strategy_lock not the individual buffer spinlocks, so it's
+	 * OK to manipulate them without holding the spinlock.
 	 */
-	while (StrategyControl->firstFreeBuffer >= 0)
+	if (INT_ACCESS_ONCE(StrategyControl->firstFreeBuffer) >= 0)
 	{
-		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
-		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
-		/* Unconditionally remove buffer from freelist */
-		StrategyControl->firstFreeBuffer = buf->freeNext;
-		buf->freeNext = FREENEXT_NOT_IN_LIST;
+		while (StrategyControl->firstFreeBuffer >= 0)
+		{
+			buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
 
-		/*
-		 * Release the lock so someone else can access the freelist (or run
-		 * the clocksweep) while we check out this buffer.
-		 */
-		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+			/* Unconditionally remove buffer from freelist */
+			StrategyControl->firstFreeBuffer = buf->freeNext;
+			buf->freeNext = FREENEXT_NOT_IN_LIST;
 
-		/*
-		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
-		 * it; discard it and retry.  (This can only happen if VACUUM put a
-		 * valid buffer in the freelist and then someone else used it before
-		 * we got to it.  It's probably impossible altogether as of 8.3, but
-		 * we'd better check anyway.)
-		 */
-		LockBufHdr(buf);
-		if (buf->refcount == 0 && buf->usage_count == 0)
-		{
-			if (strategy != NULL)
-				AddBufferToRing(strategy, buf);
-			return buf;
-		}
-		UnlockBufHdr(buf);
+			/*
+			 * Release the lock so someone else can access the freelist (or run
+			 * the clocksweep) while we check out this buffer.
+			 */
+			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 
-		/* Reacquire the lock and go around for another pass. */
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+			/*
+			 * If the buffer is pinned or has a nonzero usage_count, we cannot use
+			 * it; discard it and retry.  (This can only happen if VACUUM put a
+			 * valid buffer in the freelist and then someone else used it before
+			 * we got to it.  It's probably impossible altogether as of 8.3, but
+			 * we'd better check anyway.)
+			 */
+			LockBufHdr(buf);
+			if (buf->refcount == 0 && buf->usage_count == 0)
+			{
+				if (strategy != NULL)
+					AddBufferToRing(strategy, buf);
+				return buf;
+			}
+			UnlockBufHdr(buf);
+
+			/* Reacquire the lock and go around for another pass. */
+			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+		}
+		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 	}
 
 	/* Nothing on the freelist, so run the "clock sweep" algorithm */
 	trycounter = NBuffers;
 	for (;;)
 	{
-		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
+		int			victim;
+
+		victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
 
-		if (++StrategyControl->nextVictimBuffer >= NBuffers)
+		buf = &BufferDescriptors[victim % NBuffers];
+
+		if (victim % NBuffers == 0)
 		{
-			StrategyControl->nextVictimBuffer = 0;
-			StrategyControl->completePasses++;
+			pg_atomic_add_fetch_u32(&StrategyControl->completePasses, 1);
 		}
 
-		/* Release the lock before manipulating the candidate buffer. */
-		SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-
 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
@@ -238,9 +268,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 				elog(ERROR, "no unpinned buffers available");
 		}
 		UnlockBufHdr(buf);
-
-		/* Reacquire the lock and get a new candidate buffer. */
-		SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 	}
 }
 
@@ -284,13 +311,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 	int			result;
 
 	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-	result = StrategyControl->nextVictimBuffer;
+	result = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer) % NBuffers;
 	if (complete_passes)
-		*complete_passes = StrategyControl->completePasses;
+		*complete_passes = pg_atomic_read_u32(&StrategyControl->completePasses);
 	if (num_buf_alloc)
 	{
-		*num_buf_alloc = StrategyControl->numBufferAllocs;
-		StrategyControl->numBufferAllocs = 0;
+		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
 	}
 	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 	return result;
@@ -389,11 +415,11 @@ StrategyInitialize(bool init)
 		StrategyControl->lastFreeBuffer = NBuffers - 1;
 
 		/* Initialize the clock sweep pointer */
-		StrategyControl->nextVictimBuffer = 0;
+		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
 
 		/* Clear statistics */
-		StrategyControl->completePasses = 0;
-		StrategyControl->numBufferAllocs = 0;
+		pg_atomic_init_u32(&StrategyControl->completePasses, 0);
+		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
 
 		/* No pending notification */
 		StrategyControl->bgwriterLatch = NULL;
-- 
1.8.3.251.g1462b67
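The freelist side of the patch uses the familiar unlocked-peek /
locked-recheck shape: firstFreeBuffer is read once without the spinlock, and
only if it looks non-negative is buffer_strategy_lock taken and the list
re-examined. Here is a hedged standalone sketch of that shape only, not the
buffer-manager code: a pthread mutex stands in for buffer_strategy_lock, a
C11 atomic for firstFreeBuffer, and all names are invented for illustration.

/*
 * Sketch of the unlocked-peek / locked-recheck freelist fast path.
 * Compile with -pthread.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NSLOTS 8

static pthread_mutex_t freelist_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int first_free = -1;   /* -1 means "freelist empty" */
static int free_next[NSLOTS];        /* next-links, protected by the lock */

/* Pop a slot, or return -1 so the caller falls back to the clock sweep. */
static int
try_get_from_freelist(void)
{
    int result = -1;

    /* Cheap unlocked peek: if the list looks empty, skip the lock entirely. */
    if (atomic_load_explicit(&first_free, memory_order_relaxed) < 0)
        return -1;

    pthread_mutex_lock(&freelist_lock);
    /* Recheck under the lock; another thread may have emptied the list. */
    if (atomic_load(&first_free) >= 0)
    {
        result = atomic_load(&first_free);
        atomic_store(&first_free, free_next[result]);
    }
    pthread_mutex_unlock(&freelist_lock);

    return result;
}

int
main(void)
{
    /* Build a tiny freelist: 0 -> 1 -> 2 -> end. */
    free_next[0] = 1;
    free_next[1] = 2;
    free_next[2] = -1;
    atomic_store(&first_free, 0);

    for (int i = 0; i < 4; i++)
        printf("got slot %d\n", try_get_from_freelist());
    return 0;
}

The unlocked read is only a hint; correctness still comes from rechecking
under the lock, which mirrors the patch's INT_ACCESS_ONCE() peek followed by
the spinlocked while loop.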