On 2014-12-10 21:52:17 -0500, Robert Haas wrote:
> Maybe you could store the pgprocno instead of the PROC *.
That's a good idea. Here's a patch implementing that and other things.

Changes:

* The handling of wraparound is slightly changed. There's now a separate
  code path for the case where nextVictimBuffer is above NBuffers. That
  allows us a) to avoid the somewhat expensive modulo operation in the
  common case and b) to always return consistent results from
  StrategySyncStart().
* StrategySyncStart() no longer has a situation in which it can return
  inaccurate information. That could actually trigger an assertion in the
  bgwriter.
* There was a bug because the local victim variable was signed instead of
  unsigned.
* Clock sweep ticks are moved into a separate routine.

Comments?

Greetings,

Andres Freund

--
 Andres Freund                     http://www.2ndQuadrant.com/
 PostgreSQL Development, 24x7 Support, Training & Services
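For anyone who wants to poke at the idea outside the tree, here's a minimal standalone sketch of the tick logic. It mirrors what ClockSweepTick() in the patch does, but uses C11 atomics and a pthread mutex in place of pg_atomic_* and buffer_strategy_lock; all names (clock_tick, g_nbuffers, ...) are made up for the illustration and are not part of the patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint next_victim = 0;      /* only ever increases */
static unsigned complete_passes = 0;     /* protected by sweep_lock */
static pthread_mutex_t sweep_lock = PTHREAD_MUTEX_INITIALIZER;
static const unsigned g_nbuffers = 16;   /* stand-in for NBuffers */

static unsigned
clock_tick(void)
{
    /* advance the hand; the pre-increment value is our victim */
    unsigned victim = atomic_fetch_add(&next_victim, 1);

    if (victim >= g_nbuffers)
    {
        unsigned expected = victim + 1;   /* counter value after our add */

        victim %= g_nbuffers;

        /*
         * Only the process that caused the wraparound folds the counter
         * back into range and bumps complete_passes, both under the lock
         * so a reader can see a consistent pair of values.
         */
        if (victim == 0)
        {
            bool success = false;

            while (!success)
            {
                unsigned wrapped = expected % g_nbuffers;

                pthread_mutex_lock(&sweep_lock);
                success = atomic_compare_exchange_strong(&next_victim,
                                                         &expected, wrapped);
                if (success)
                    complete_passes++;
                pthread_mutex_unlock(&sweep_lock);
            }
        }
    }
    return victim;
}

int
main(void)
{
    /* single-threaded demo: the hand wraps twice over 40 ticks */
    for (int i = 0; i < 40; i++)
        printf("victim %u (passes %u)\n", clock_tick(), complete_passes);
    return 0;
}

The point of the compare-and-swap loop is that losing the race is harmless: a concurrent fetch-add just refreshes 'expected' and the wrapping process retries, while everyone else keeps ticking without ever touching the lock.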
>From b757437498365f1f8f76fe878ec263e25cb8347f Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Tue, 23 Dec 2014 21:30:14 +0100
Subject: [PATCH] Lockless StrategyGetBuffer clock sweep hot path.

StrategyGetBuffer() has proven to be a bottleneck in a number of
buffer acquisition heavy workloads. To some degree this has already been
alleviated by 5d7962c6, but it can still be quite a heavy bottleneck.
The problem is that in unfortunate usage patterns a single
StrategyGetBuffer() call will have to look at a large number of buffers -
in turn making it likely that the process will be put to sleep while
still holding the spinlock.

Replace most of the buffer_strategy_lock usage for the clock sweep by an
atomic nextVictimBuffer variable. That variable, modulo NBuffers, is the
current hand of the clock sweep. The buffer clock sweep only needs to
acquire the spinlock after a wraparound, and even then only in the
process wrapping around. That alleviates nearly all the contention on the
relevant spinlock.

Reviewed-By: Robert Haas and Amit Kapila
---
 src/backend/postmaster/bgwriter.c     |   4 +-
 src/backend/storage/buffer/freelist.c | 253 ++++++++++++++++++++++++----------
 src/include/storage/buf_internals.h   |   2 +-
 3 files changed, 180 insertions(+), 79 deletions(-)

diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 780ee3b..101854a 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -373,13 +373,13 @@ BackgroundWriterMain(void)
         if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate)
         {
             /* Ask for notification at next buffer allocation */
-            StrategyNotifyBgWriter(&MyProc->procLatch);
+            StrategyNotifyBgWriter(MyProc->pgprocno);
             /* Sleep ... */
             rc = WaitLatch(&MyProc->procLatch,
                            WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                            BgWriterDelay * HIBERNATE_FACTOR);
             /* Reset the notification request in case we timed out */
-            StrategyNotifyBgWriter(NULL);
+            StrategyNotifyBgWriter(-1);
         }
 
         /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 5966beb..eb01811 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,8 +15,12 @@
  */
 #include "postgres.h"
 
+#include "port/atomics.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/proc.h"
+
+#define INT_ACCESS_ONCE(var)    ((int)(*((volatile int *)&(var))))
 
 
 /*
@@ -27,8 +31,12 @@ typedef struct
     /* Spinlock: protects the values below */
     slock_t     buffer_strategy_lock;
 
-    /* Clock sweep hand: index of next buffer to consider grabbing */
-    int         nextVictimBuffer;
+    /*
+     * Clock sweep hand: index of next buffer to consider grabbing. Note that
+     * this isn't a concrete buffer - we only ever increase the value. So, to
+     * get an actual buffer, it needs to be used modulo NBuffers.
+     */
+    pg_atomic_uint32 nextVictimBuffer;
 
     int         firstFreeBuffer;    /* Head of list of unused buffers */
     int         lastFreeBuffer; /* Tail of list of unused buffers */
@@ -42,13 +50,14 @@ typedef struct
      * Statistics.  These counters should be wide enough that they can't
      * overflow during a single bgwriter cycle.
      */
-    uint32      completePasses; /* Complete cycles of the clock sweep */
-    uint32      numBufferAllocs;    /* Buffers allocated since last reset */
+    uint32      completePasses; /* Complete cycles of the clock sweep */
+    pg_atomic_uint32 numBufferAllocs;   /* Buffers allocated since last reset */
 
     /*
-     * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
+     * Bgworker process to be notified upon activity or -1 if none. See
+     * StrategyNotifyBgWriter.
      */
-    Latch      *bgwriterLatch;
+    int         bgwprocno;
 } BufferStrategyControl;
 
 /* Pointers to shared state */
@@ -93,6 +102,70 @@ static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
 static void AddBufferToRing(BufferAccessStrategy strategy,
                          volatile BufferDesc *buf);
 
+/*
+ * ClockSweepTick - Helper routine for StrategyGetBuffer()
+ *
+ * Move the clock hand one buffer ahead of its current position and return the
+ * id of the buffer now under the hand.
+ */
+static inline uint32
+ClockSweepTick(void)
+{
+    uint32      victim;
+
+    /*
+     * Atomically move hand ahead one buffer - if several processes are
+     * doing this, this can lead to buffers being returned slightly out of
+     * apparent order.
+     */
+    victim =
+        pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+
+    if (victim >= NBuffers)
+    {
+        uint32      originalVictim = victim;
+
+        /* always wrap what we look up in BufferDescriptors */
+        victim = victim % NBuffers;
+
+        /*
+         * If we're the one that just caused a wraparound, force
+         * completePasses to be incremented while holding the spinlock. We
+         * need the spinlock so StrategySyncStart() can return a consistent
+         * value consisting of nextVictimBuffer and completePasses.
+         */
+        if (victim == 0)
+        {
+            uint32      expected;
+            uint32      wrapped;
+            bool        success = false;
+
+            expected = originalVictim + 1;
+
+            while (!success)
+            {
+                /*
+                 * Acquire the spinlock while increasing completePasses. That
+                 * allows other readers to read nextVictimBuffer and
+                 * completePasses in a consistent manner which is required for
+                 * StrategySyncStart().  In theory delaying the increment
+                 * could lead to an overflow of nextVictimBuffer, but that's
+                 * highly unlikely and wouldn't be particularly harmful.
+                 */
+                SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+                wrapped = expected % NBuffers;
+
+                success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+                                                         &expected, wrapped);
+                if (success)
+                    StrategyControl->completePasses++;
+                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+            }
+        }
+    }
+    return victim;
+}
 
 /*
  * StrategyGetBuffer
@@ -110,7 +183,7 @@ volatile BufferDesc *
 StrategyGetBuffer(BufferAccessStrategy strategy)
 {
     volatile BufferDesc *buf;
-    Latch      *bgwriterLatch;
+    int         bgwprocno;
     int         trycounter;
 
     /*
@@ -124,86 +197,107 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
             return buf;
     }
 
-    /* Nope, so lock the freelist */
-    SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+    /*
+     * If asked, we need to waken the bgwriter. Since we don't want to rely
+     * on a spinlock for this we force a read from shared memory once, and
+     * then set the latch based on that value. We need to go to this length
+     * because otherwise bgwprocno might be reset while/after we check, as
+     * the compiler might just reread the value from memory.
+     *
+     * This can possibly set the latch of the wrong process if the bgwriter
+     * dies at the wrong moment. But since PGPROC->procLatch is never
+     * deallocated the worst consequence of that is that we set the latch of
+     * some arbitrary process.
+     */
+    bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
+    if (bgwprocno != -1)
+    {
+        /* reset bgwprocno first, before setting the latch */
+        StrategyControl->bgwprocno = -1;
+        pg_write_barrier();
+
+        /*
+         * Not acquiring ProcArrayLock here which is slightly icky. It's
+         * actually fine because procLatch isn't ever freed, so we just can
+         * potentially set the wrong process' (or no process') latch.
+         */
+        SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+    }
 
     /*
      * We count buffer allocation requests so that the bgwriter can estimate
      * the rate of buffer consumption.  Note that buffers recycled by a
      * strategy object are intentionally not counted here.
      */
-    StrategyControl->numBufferAllocs++;
+    pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
 
     /*
-     * If bgwriterLatch is set, we need to waken the bgwriter, but we should
-     * not do so while holding buffer_strategy_lock; so release and re-grab.
-     * This is annoyingly tedious, but it happens at most once per bgwriter
-     * cycle, so the performance hit is minimal.
+     * First check, without acquiring the lock, whether there are buffers in
+     * the freelist. Since we otherwise don't require the spinlock in every
+     * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
+     * uselessly in most cases. That obviously leaves a race where a buffer
+     * is put on the freelist but we don't see the store yet - but that's
+     * pretty harmless, it'll just get used during the next buffer acquisition.
+     *
+     * If there are buffers on the freelist, acquire the spinlock to pop one
+     * buffer off the freelist. Then check whether that buffer is usable and
+     * repeat if not.
+     *
+     * Note that the freeNext fields are considered to be protected by the
+     * buffer_strategy_lock not the individual buffer spinlocks, so it's OK
+     * to manipulate them without holding the spinlock.
      */
-    bgwriterLatch = StrategyControl->bgwriterLatch;
-    if (bgwriterLatch)
+    if (StrategyControl->firstFreeBuffer >= 0)
     {
-        StrategyControl->bgwriterLatch = NULL;
-        SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-        SetLatch(bgwriterLatch);
-        SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-    }
+        while (true)
+        {
+            /* Acquire the spinlock to remove element from the freelist */
+            SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
-    /*
-     * Try to get a buffer from the freelist.  Note that the freeNext fields
-     * are considered to be protected by the buffer_strategy_lock not the
-     * individual buffer spinlocks, so it's OK to manipulate them without
-     * holding the spinlock.
-     */
-    while (StrategyControl->firstFreeBuffer >= 0)
-    {
-        buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
-        Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+            if (StrategyControl->firstFreeBuffer < 0)
+            {
+                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+                break;
+            }
 
-        /* Unconditionally remove buffer from freelist */
-        StrategyControl->firstFreeBuffer = buf->freeNext;
-        buf->freeNext = FREENEXT_NOT_IN_LIST;
+            buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
+            Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
 
-        /*
-         * Release the lock so someone else can access the freelist (or run
-         * the clocksweep) while we check out this buffer.
-         */
-        SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+            /* Unconditionally remove buffer from freelist */
+            StrategyControl->firstFreeBuffer = buf->freeNext;
+            buf->freeNext = FREENEXT_NOT_IN_LIST;
 
-        /*
-         * If the buffer is pinned or has a nonzero usage_count, we cannot use
-         * it; discard it and retry.  (This can only happen if VACUUM put a
-         * valid buffer in the freelist and then someone else used it before
-         * we got to it.  It's probably impossible altogether as of 8.3, but
-         * we'd better check anyway.)
-         */
-        LockBufHdr(buf);
-        if (buf->refcount == 0 && buf->usage_count == 0)
-        {
-            if (strategy != NULL)
-                AddBufferToRing(strategy, buf);
-            return buf;
-        }
-        UnlockBufHdr(buf);
+            /*
+             * Release the lock so someone else can access the freelist while
+             * we check out this buffer.
+             */
+            SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 
-        /* Reacquire the lock and go around for another pass. */
-        SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+            /*
+             * If the buffer is pinned or has a nonzero usage_count, we cannot
+             * use it; discard it and retry.  (This can only happen if VACUUM
+             * put a valid buffer in the freelist and then someone else used
+             * it before we got to it.  It's probably impossible altogether as
+             * of 8.3, but we'd better check anyway.)
+             */
+            LockBufHdr(buf);
+            if (buf->refcount == 0 && buf->usage_count == 0)
+            {
+                if (strategy != NULL)
+                    AddBufferToRing(strategy, buf);
                return buf;
+            }
+            UnlockBufHdr(buf);
+
+        }
     }
 
     /* Nothing on the freelist, so run the "clock sweep" algorithm */
     trycounter = NBuffers;
     for (;;)
     {
-        buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];
-
-        if (++StrategyControl->nextVictimBuffer >= NBuffers)
-        {
-            StrategyControl->nextVictimBuffer = 0;
-            StrategyControl->completePasses++;
-        }
-
-        /* Release the lock before manipulating the candidate buffer. */
-        SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+        buf = &BufferDescriptors[ClockSweepTick()];
 
         /*
          * If the buffer is pinned or has a nonzero usage_count, we cannot use
@@ -238,9 +332,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
                 elog(ERROR, "no unpinned buffers available");
         }
         UnlockBufHdr(buf);
-
-        /* Reacquire the lock and get a new candidate buffer. */
-        SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
     }
 }
 
@@ -281,16 +372,26 @@ StrategyFreeBuffer(volatile BufferDesc *buf)
 int
 StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 {
+    uint32      nextVictimBuffer;
     int         result;
 
     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-    result = StrategyControl->nextVictimBuffer;
+    nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
+    result = nextVictimBuffer % NBuffers;
+
     if (complete_passes)
+    {
         *complete_passes = StrategyControl->completePasses;
+
+        /*
+         * Additionally add the number of wraparounds that happened before
+         * completePasses could be incremented. Cf. ClockSweepTick().
+         */
+        *complete_passes += nextVictimBuffer / NBuffers;
+    }
+
     if (num_buf_alloc)
     {
-        *num_buf_alloc = StrategyControl->numBufferAllocs;
-        StrategyControl->numBufferAllocs = 0;
+        *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
     }
     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
     return result;
@@ -305,7 +406,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
  * from hibernation, and is not meant for anybody else to use.
  */
 void
-StrategyNotifyBgWriter(Latch *bgwriterLatch)
+StrategyNotifyBgWriter(int bgwprocno)
 {
     /*
      * We acquire buffer_strategy_lock just to ensure that the store appears
@@ -313,7 +414,7 @@ StrategyNotifyBgWriter(Latch *bgwriterLatch)
      * infrequently, so there's no performance penalty from being safe.
      */
     SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-    StrategyControl->bgwriterLatch = bgwriterLatch;
+    StrategyControl->bgwprocno = bgwprocno;
     SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 }
 
@@ -389,14 +490,14 @@ StrategyInitialize(bool init)
         StrategyControl->lastFreeBuffer = NBuffers - 1;
 
         /* Initialize the clock sweep pointer */
-        StrategyControl->nextVictimBuffer = 0;
+        pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
 
         /* Clear statistics */
         StrategyControl->completePasses = 0;
-        StrategyControl->numBufferAllocs = 0;
+        pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
 
         /* No pending notification */
-        StrategyControl->bgwriterLatch = NULL;
+        StrategyControl->bgwprocno = -1;
     }
     else
         Assert(!init);
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0e69b63..7cf37b6 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -191,7 +191,7 @@ extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
                         volatile BufferDesc *buf);
 extern int  StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
 
-extern void StrategyNotifyBgWriter(Latch *bgwriterLatch);
+extern void StrategyNotifyBgWriter(int bgwprocno);
 
 extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
-- 
2.2.0.rc0.18.ga1ad247
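In case the StrategySyncStart() bookkeeping is easier to follow outside the diff context, here's a tiny self-contained sketch of the arithmetic, with made-up names and no locking (in the patch both fields are read under buffer_strategy_lock):

#include <stdint.h>
#include <stdio.h>

struct sweep_state
{
    uint32_t complete_passes;   /* bumped by whoever wraps the hand */
    uint32_t next_victim;       /* monotonically increasing hand */
};

/*
 * The buffer id is the hand modulo nbuffers; wraparounds whose
 * complete_passes increment hasn't happened yet show up as
 * next_victim / nbuffers and are added back on the reader's side.
 */
static void
sync_start(const struct sweep_state *s, uint32_t nbuffers,
           uint32_t *passes, uint32_t *buf_id)
{
    *buf_id = s->next_victim % nbuffers;
    *passes = s->complete_passes + s->next_victim / nbuffers;
}

int
main(void)
{
    /* the hand is 3 buffers past a wraparound that hasn't been folded yet */
    struct sweep_state s = { .complete_passes = 4, .next_victim = 128 + 3 };
    uint32_t passes, buf_id;

    sync_start(&s, 128, &passes, &buf_id);
    printf("passes=%u buf=%u\n", (unsigned) passes, (unsigned) buf_id);
    /* prints: passes=5 buf=3 */
    return 0;
}

Once the wrapping backend gets the spinlock and folds the counter, next_victim becomes 3 and complete_passes becomes 5, so a reader sees the same answer either way.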