On 2016-03-28 15:46:43 +0300, Alexander Korotkov wrote: > diff --git a/src/backend/storage/buffer/bufmnew file mode 100644 > index 6dd7c6e..fe6fb9c > --- a/src/backend/storage/buffer/bufmgr.c > +++ b/src/backend/storage/buffer/bufmgr.c > @@ -52,7 +52,6 @@ > #include "utils/resowner_private.h" > #include "utils/timestamp.h" > > - > /* Note: these two macros only work on shared buffers, not local ones! */ > #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) > (bufHdr)->buf_id) * BLCKSZ)) > #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
spurious > @@ -848,7 +852,7 @@ ReadBuffer_common(SMgrRelation smgr, cha > * it's not been recycled) but come right back here to try smgrextend > * again. > */ > - Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed > */ > + Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* header > lock not needed */ > > bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : > BufHdrGetBlock(bufHdr); > > @@ -932,8 +936,13 @@ ReadBuffer_common(SMgrRelation smgr, cha > > if (isLocalBuf) > { > - /* Only need to adjust flags */ > - bufHdr->flags |= BM_VALID; > + /* > + * Only need to adjust flags. Since it's local buffer, there > is no > + * concurrency. We assume read/write pair to be cheaper than > atomic OR. > + */ > + uint32 state = pg_atomic_read_u32(&bufHdr->state); > + state |= BM_VALID; > + pg_atomic_write_u32(&bufHdr->state, state); I'm not a fan of repeating that comment in multiple places. Hm. > } > else > { > @@ -987,10 +996,11 @@ BufferAlloc(SMgrRelation smgr, char relp > BufferTag oldTag; /* previous identity of > selected buffer */ > uint32 oldHash; /* hash value for oldTag */ > LWLock *oldPartitionLock; /* buffer partition lock for it > */ > - BufFlags oldFlags; > + uint32 oldFlags; > int buf_id; > BufferDesc *buf; > bool valid; > + uint32 state; > > /* create a tag so we can lookup the buffer */ > INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum); > @@ -1050,23 +1060,23 @@ BufferAlloc(SMgrRelation smgr, char relp > for (;;) > { > /* > - * Ensure, while the spinlock's not yet held, that there's a > free > + * Ensure, while the header lock isn't yet held, that there's a > free > * refcount entry. > */ > ReservePrivateRefCountEntry(); > > /* > * Select a victim buffer. The buffer is returned with its > header > - * spinlock still held! > + * lock still held! 
> */ > - buf = StrategyGetBuffer(strategy); > + buf = StrategyGetBuffer(strategy, &state); The new thing really still is a spinlock, not sure if it's worth changing the comments that way. > @@ -1319,7 +1330,7 @@ BufferAlloc(SMgrRelation smgr, char relp > * InvalidateBuffer -- mark a shared buffer invalid and return it to the > * freelist. > * > - * The buffer header spinlock must be held at entry. We drop it before > + * The buffer header lock must be held at entry. We drop it before > * returning. (This is sane because the caller must have locked the > * buffer in order to be sure it should be dropped.) > * Ok, by now I'm pretty sure that I don't want this to be changed everywhere, just makes reviewing harder. > @@ -1433,6 +1443,7 @@ void > MarkBufferDirty(Buffer buffer) > { > BufferDesc *bufHdr; > + uint32 state; > > if (!BufferIsValid(buffer)) > elog(ERROR, "bad buffer ID: %d", buffer); > @@ -1449,14 +1460,14 @@ MarkBufferDirty(Buffer buffer) > /* unfortunately we can't check if the lock is held exclusively */ > Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); > > - LockBufHdr(bufHdr); > + state = LockBufHdr(bufHdr); > > - Assert(bufHdr->refcount > 0); > + Assert(BUF_STATE_GET_REFCOUNT(state) > 0); > > /* > * If the buffer was not dirty already, do vacuum accounting. > */ > - if (!(bufHdr->flags & BM_DIRTY)) > + if (!(state & BM_DIRTY)) > { > VacuumPageDirty++; > pgBufferUsage.shared_blks_dirtied++; > @@ -1464,9 +1475,9 @@ MarkBufferDirty(Buffer buffer) > VacuumCostBalance += VacuumCostPageDirty; > } > > - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); > - > - UnlockBufHdr(bufHdr); > + state |= BM_DIRTY | BM_JUST_DIRTIED; > + state &= ~BM_LOCKED; > + pg_atomic_write_u32(&bufHdr->state, state); > } Hm, this is a routine I think we should avoid taking the lock in; it's often called quite frequently. Also doesn't look very hard. 
> static bool > PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) > @@ -1547,23 +1563,44 @@ PinBuffer(BufferDesc *buf, BufferAccessS > > if (ref == NULL) > { > + /* loop of CAS operations */ > + uint32 state; > + uint32 oldstate; > + SpinDelayStatus delayStatus; > ReservePrivateRefCountEntry(); > ref = NewPrivateRefCountEntry(b); > > - LockBufHdr(buf); > - buf->refcount++; > - if (strategy == NULL) > - { > - if (buf->usage_count < BM_MAX_USAGE_COUNT) > - buf->usage_count++; > - } > - else > + state = pg_atomic_read_u32(&buf->state); > + oldstate = state; > + > + init_spin_delay(&delayStatus, (Pointer)buf, __FILE__, __LINE__); Hm. This way we're calling this on every iteration. That doesn't seem like a good idea. How about making delayStatus a static, and init_spin_delay a macro which returns a {struct, member, one, two} type literal? > + while (true) > { > - if (buf->usage_count == 0) > - buf->usage_count = 1; > + /* spin-wait till lock is free */ > + while (state & BM_LOCKED) > + { > + make_spin_delay(&delayStatus); > + state = pg_atomic_read_u32(&buf->state); > + oldstate = state; > + } Maybe we should abstract that to pg_atomic_wait_bit_unset_u32()? It seems quite likely we need this in other places (e.g. lwlock.c itself). > + /* increase refcount */ > + state += BUF_REFCOUNT_ONE; > + > + /* increase usagecount unless already max */ > + if (BUF_STATE_GET_USAGECOUNT(state) != > BM_MAX_USAGE_COUNT) > + state += BUF_USAGECOUNT_ONE; > + > + /* try to do CAS, exit on success */ Seems like a somewhat obvious comment? > @@ -1603,15 +1640,22 @@ PinBuffer_Locked(BufferDesc *buf) > { > Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) > == NULL); > > - buf->refcount++; > - UnlockBufHdr(buf); > + /* > + * Since we assume to held buffer header lock, we can update the buffer > + * state in a single write operation. 
> + */ > + state = pg_atomic_read_u32(&buf->state); > + state += 1; > + state &= ~BM_LOCKED; > + pg_atomic_write_u32(&buf->state, state); The comment should probably mention that we're releasing the spinlock. And the += 1 should be a BUF_REFCOUNT_ONE, otherwise it's hard to understand. > @@ -1646,30 +1690,66 @@ UnpinBuffer(BufferDesc *buf, bool fixOwn > ref->refcount--; > if (ref->refcount == 0) > { > + > + /* Support LockBufferForCleanup() */ > + if (state & BM_PIN_COUNT_WAITER) > + { > + state = LockBufHdr(buf); > + > + if (state & BM_PIN_COUNT_WAITER && > BUF_STATE_GET_REFCOUNT(state) == 1) > + { > + /* we just released the last pin other than the > waiter's */ > + int wait_backend_pid = > buf->wait_backend_pid; > > + state &= ~(BM_PIN_COUNT_WAITER | BM_LOCKED); > + pg_atomic_write_u32(&buf->state, state); > + ProcSendSignal(wait_backend_pid); > + } > + else > + UnlockBufHdr(buf); > + } I think it's quite confusing to use UnlockBufHdr and direct bit expressions in one branch. Thinking about it I also don't think the pg_atomic_write_u32 variant is correct without adding a write barrier; the other side might not see the values yet. I think we can just redefine UnlockBufHdr() to be a pg_atomic_write_u32() and pg_write_memory_barrier()? > * BM_PERMANENT can't be changed while we hold a pin on the buffer, so > we > - * need not bother with the buffer header spinlock. Even if someone > else > + * need not bother with the buffer header lock. Even if someone else > * changes the buffer header flags while we're doing this, we assume > that > * changing an aligned 2-byte BufFlags value is atomic, so we'll read > the > * old value or the new value, but not random garbage. > */ The rest of the comment is outdated, BufFlags isn't a 2 byte value anymore. 
> @@ -3078,7 +3168,7 @@ FlushRelationBuffers(Relation rel) > localpage, > false); > > - bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); > + pg_atomic_fetch_and_u32(&bufHdr->state, > ~(BM_DIRTY | BM_JUST_DIRTIED)); Hm, in other places you replaced atomics on local buffers with plain writes. > lsn = XLogSaveBufferForHint(buffer, buffer_std); > } > > - LockBufHdr(bufHdr); > - Assert(bufHdr->refcount > 0); > - if (!(bufHdr->flags & BM_DIRTY)) > + state = LockBufHdr(bufHdr); > + > + Assert(BUF_STATE_GET_REFCOUNT(state) > 0); > + > + if (!(state & BM_DIRTY)) > { > dirtied = true; /* Means "will be dirtied by > this action" */ It's again worthwhile to try make this work without taking the lock. > - buf->flags |= BM_IO_IN_PROGRESS; > - > - UnlockBufHdr(buf); > + state |= BM_IO_IN_PROGRESS; > + state &= ~BM_LOCKED; > + pg_atomic_write_u32(&buf->state, state); How about making UnlockBufHdr() take a new state parameter, and internally unset BM_LOCKED? > /* > + * Lock buffer header - set BM_LOCKED in buffer state. > + */ > +uint32 > +LockBufHdr(volatile BufferDesc *desc) > +{ > + SpinDelayStatus delayStatus; > + uint32 state; > + > + init_spin_delay(&delayStatus, (Pointer)desc, __FILE__, __LINE__); > + > + state = pg_atomic_read_u32(&desc->state); > + > + for (;;) > + { > + /* wait till lock is free */ > + while (state & BM_LOCKED) > + { > + make_spin_delay(&delayStatus); > + state = pg_atomic_read_u32(&desc->state); > + /* Add exponential backoff? Should seldomly be > contended tho. */ Outdated comment. > +/* > + * Unlock buffer header - unset BM_LOCKED in buffer state. > + */ > +void > +UnlockBufHdr(volatile BufferDesc *desc) > +{ > + Assert(pg_atomic_read_u32(&desc->state) & BM_LOCKED); > + > + pg_atomic_sub_fetch_u32(&desc->state, BM_LOCKED); > +} As suggested above, there's likely no need to use an actual atomic op here. 
Greetings, Andres Freund -- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers