On 2016-03-28 15:46:43 +0300, Alexander Korotkov wrote: > diff --git a/src/backend/storage/buffer/bufmnew file mode 100644 > index 6dd7c6e..fe6fb9c > --- a/src/backend/storage/buffer/bufmgr.c > +++ b/src/backend/storage/buffer/bufmgr.c > @@ -52,7 +52,6 @@ > #include "utils/resowner_private.h" > #include "utils/timestamp.h" > > - > /* Note: these two macros only work on shared buffers, not local ones! */ > #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) > (bufHdr)->buf_id) * BLCKSZ)) > #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
spurious > @@ -848,7 +852,7 @@ ReadBuffer_common(SMgrRelation smgr, cha > * it's not been recycled) but come right back here to try smgrextend > * again. > */ > - Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed > */ > + Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header > lock not needed */ > > bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : > BufHdrGetBlock(bufHdr); > > @@ -932,8 +936,13 @@ ReadBuffer_common(SMgrRelation smgr, cha > > if (isLocalBuf) > { > - /* Only need to adjust flags */ > - bufHdr->flags |= BM_VALID; > + /* > + * Only need to adjust flags. Since it's local buffer, there > is no > + * concurrency. We assume read/write pair to be cheaper than > atomic OR. > + */ > + uint32 state = pg_atomic_read_u32(&bufHdr->state); > + state |= BM_VALID; > + pg_atomic_write_u32(&bufHdr->state, state); I'm not a fan of repeating that comment in multiple places. Hm. > } > else > { > @@ -987,10 +996,11 @@ BufferAlloc(SMgrRelation smgr, char relp > BufferTag oldTag; /* previous identity of > selected buffer */ > uint32 oldHash; /* hash value for oldTag */ > LWLock *oldPartitionLock; /* buffer partition lock for it > */ > - BufFlags oldFlags; > + uint32 oldFlags; > int buf_id; > BufferDesc *buf; > bool valid; > + uint32 state; > > /* create a tag so we can lookup the buffer */ > INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum); > @@ -1050,23 +1060,23 @@ BufferAlloc(SMgrRelation smgr, char relp > for (;;) > { > /* > - * Ensure, while the spinlock's not yet held, that there's a > free > + * Ensure, while the header lock isn't yet held, that there's a > free > * refcount entry. > */ > ReservePrivateRefCountEntry(); > > /* > * Select a victim buffer. The buffer is returned with its > header > - * spinlock still held! > + * lock still held! 
> */ > - buf = StrategyGetBuffer(strategy); > + buf = StrategyGetBuffer(strategy, &state); The new thing really still is a spinlock, not sure if it's worth changing the comments that way. > @@ -1319,7 +1330,7 @@ BufferAlloc(SMgrRelation smgr, char relp > * InvalidateBuffer -- mark a shared buffer invalid and return it to the > * freelist. > * > - * The buffer header spinlock must be held at entry. We drop it before > + * The buffer header lock must be held at entry. We drop it before > * returning. (This is sane because the caller must have locked the > * buffer in order to be sure it should be dropped.) > * Ok, by now I'm pretty sure that I don't want this to be changed everywhere, just makes reviewing harder. > @@ -1433,6 +1443,7 @@ void > MarkBufferDirty(Buffer buffer) > { > BufferDesc *bufHdr; > + uint32 state; > > if (!BufferIsValid(buffer)) > elog(ERROR, "bad buffer ID: %d", buffer); > @@ -1449,14 +1460,14 @@ MarkBufferDirty(Buffer buffer) > /* unfortunately we can't check if the lock is held exclusively */ > Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); > > - LockBufHdr(bufHdr); > + state = LockBufHdr(bufHdr); > > - Assert(bufHdr->refcount > 0); > + Assert(BUF_STATE_GET_REFCOUNT(state) > 0); > > /* > * If the buffer was not dirty already, do vacuum accounting. > */ > - if (!(bufHdr->flags & BM_DIRTY)) > + if (!(state & BM_DIRTY)) > { > VacuumPageDirty++; > pgBufferUsage.shared_blks_dirtied++; > @@ -1464,9 +1475,9 @@ MarkBufferDirty(Buffer buffer) > VacuumCostBalance += VacuumCostPageDirty; > } > > - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); > - > - UnlockBufHdr(bufHdr); > + state |= BM_DIRTY | BM_JUST_DIRTIED; > + state &= ~BM_LOCKED; > + pg_atomic_write_u32(&bufHdr->state, state); > } Hm, this is a routine I think we should avoid taking the lock in; it's often called quite frequently. Also doesn't look very hard. 
> static bool > PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) > @@ -1547,23 +1563,44 @@ PinBuffer(BufferDesc *buf, BufferAccessS > > if (ref == NULL) > { > + /* loop of CAS operations */ > + uint32 state; > + uint32 oldstate; > + SpinDelayStatus delayStatus; > ReservePrivateRefCountEntry(); > ref = NewPrivateRefCountEntry(b); > > - LockBufHdr(buf); > - buf->refcount++; > - if (strategy == NULL) > - { > - if (buf->usage_count < BM_MAX_USAGE_COUNT) > - buf->usage_count++; > - } > - else > + state = pg_atomic_read_u32(&buf->state); > + oldstate = state; > + > + init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__); Hm. This way we're calling this on every iteration. That doesn't seem like a good idea. How about making delayStatus a static, and init_spin_delay a macro which returns a {struct, member, one, two} type literal? > + while (true) > { > - if (buf->usage_count == 0) > - buf->usage_count = 1; > + /* spin-wait till lock is free */ > + while (state & BM_LOCKED) > + { > + make_spin_delay(&delayStatus); > + state = pg_atomic_read_u32(&buf->state); > + oldstate = state; > + } Maybe we should abstract that to pg_atomic_wait_bit_unset_u32()? It seems quite likely we need this in other places (e.g. lwlock.c itself). > + /* increase refcount */ > + state += BUF_REFCOUNT_ONE; > + > + /* increase usagecount unless already max */ > + if (BUF_STATE_GET_USAGECOUNT(state) != > BM_MAX_USAGE_COUNT) > + state += BUF_USAGECOUNT_ONE; > + > + /* try to do CAS, exit on success */ Seems like a somewhat obvious comment? > @@ -1603,15 +1640,22 @@ PinBuffer_Locked(BufferDesc *buf) > { > Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) > == NULL); > > - buf->refcount++; > - UnlockBufHdr(buf); > + /* > + * Since we assume to held buffer header lock, we can update the buffer > + * state in a single write operation. 
> + */ > + state = pg_atomic_read_u32(&buf->state); > + state += 1; > + state &= ~BM_LOCKED; > + pg_atomic_write_u32(&buf->state, state); The comment should probably mention that we're releasing the spinlock. And the += 1 should be a BUF_REFCOUNT_ONE, otherwise it's hard to understand. > @@ -1646,30 +1690,66 @@ UnpinBuffer(BufferDesc *buf, bool fixOwn > ref->refcount--; > if (ref->refcount == 0) > { > + > + /* Support LockBufferForCleanup() */ > + if (state & BM_PIN_COUNT_WAITER) > + { > + state = LockBufHdr(buf); > + > + if (state & BM_PIN_COUNT_WAITER && > BUF_STATE_GET_REFCOUNT(state) == 1) > + { > + /* we just released the last pin other than the > waiter's */ > + int wait_backend_pid = > buf->wait_backend_pid; > > + state &= ~(BM_PIN_COUNT_WAITER | BM_LOCKED); > + pg_atomic_write_u32(&buf->state, state); > + ProcSendSignal(wait_backend_pid); > + } > + else > + UnlockBufHdr(buf); > + } I think it's quite confusing to use UnlockBufHdr and direct bit expressions in one branch. Thinking about it I also don't think the pg_atomic_write_u32 variant is correct without adding a write barrier; the other side might not see the values yet. I think we can just redefine UnlockBufHdr() to be a pg_atomic_write_u32() and pg_write_memory_barrier()? > * BM_PERMANENT can't be changed while we hold a pin on the buffer, so > we > - * need not bother with the buffer header spinlock. Even if someone > else > + * need not bother with the buffer header lock. Even if someone else > * changes the buffer header flags while we're doing this, we assume > that > * changing an aligned 2-byte BufFlags value is atomic, so we'll read > the > * old value or the new value, but not random garbage. > */ The rest of the comment is outdated, BufFlags isn't a 2 byte value anymore. 
> @@ -3078,7 +3168,7 @@ FlushRelationBuffers(Relation rel) > localpage, > false); > > - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); > + pg_atomic_fetch_and_u32(&bufHdr->state, > ~(BM_DIRTY | BM_JUST_DIRTIED)); Hm, in other places you replaced atomics on local buffers with plain writes. > lsn = XLogSaveBufferForHint(buffer, buffer_std); > } > > - LockBufHdr(bufHdr); > - Assert(bufHdr->refcount > 0); > - if (!(bufHdr->flags & BM_DIRTY)) > + state = LockBufHdr(bufHdr); > + > + Assert(BUF_STATE_GET_REFCOUNT(state) > 0); > + > + if (!(state & BM_DIRTY)) > { > dirtied = true; /* Means "will be dirtied by > this action" */ It's again worthwhile to try make this work without taking the lock. > - buf->flags |= BM_IO_IN_PROGRESS; > - > - UnlockBufHdr(buf); > + state |= BM_IO_IN_PROGRESS; > + state &= ~BM_LOCKED; > + pg_atomic_write_u32(&buf->state, state); How about making UnlockBufHdr() take a new state parameter, and internally unset BM_LOCKED? > /* > + * Lock buffer header - set BM_LOCKED in buffer state. > + */ > +uint32 > +LockBufHdr(volatile BufferDesc *desc) > +{ > + SpinDelayStatus delayStatus; > + uint32 state; > + > + init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__); > + > + state = pg_atomic_read_u32(&desc->state); > + > + for (;;) > + { > + /* wait till lock is free */ > + while (state & BM_LOCKED) > + { > + make_spin_delay(&delayStatus); > + state = pg_atomic_read_u32(&desc->state); > + /* Add exponential backoff? Should seldomly be > contended tho. */ Outdated comment. > +/* > + * Unlock buffer header - unset BM_LOCKED in buffer state. > + */ > +void > +UnlockBufHdr(volatile BufferDesc *desc) > +{ > + Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED); > + > + pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED); > +} As suggested above, there's likely no need to use an actual atomic op here. 
Greetings, Andres Freund -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers