diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
new file mode 100644
index 6622d22..b583049
*** a/contrib/pg_buffercache/pg_buffercache_pages.c
--- b/contrib/pg_buffercache/pg_buffercache_pages.c
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 149,158 ****
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
--- 149,159 ----
  		for (i = 0; i < NBuffers; i++)
  		{
  			volatile BufferDesc *bufHdr;
+ 			uint32 state;
  
  			bufHdr = GetBufferDescriptor(i);
  			/* Lock each buffer header before inspecting. */
! 			state = LockBufHdr(bufHdr);
  
  			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
  			fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
*************** pg_buffercache_pages(PG_FUNCTION_ARGS)
*** 160,180 ****
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = bufHdr->usage_count;
! 			fctx->record[i].pinning_backends = bufHdr->refcount;
  
! 			if (bufHdr->flags & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr);
  		}
  
  		/*
--- 161,181 ----
  			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
  			fctx->record[i].forknum = bufHdr->tag.forkNum;
  			fctx->record[i].blocknum = bufHdr->tag.blockNum;
! 			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(state);
! 			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(state);
  
! 			if (state & BM_DIRTY)
  				fctx->record[i].isdirty = true;
  			else
  				fctx->record[i].isdirty = false;
  
  			/* Note if the buffer is valid, and has storage created */
! 			if ((state & BM_VALID) && (state & BM_TAG_VALID))
  				fctx->record[i].isvalid = true;
  			else
  				fctx->record[i].isvalid = false;
  
! 			UnlockBufHdr(bufHdr, state);
  		}
  
  		/*
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index bfa37f1..a5cffc7
*** a/src/backend/storage/buffer/buf_init.c
--- b/src/backend/storage/buffer/buf_init.c
*************** InitBufferPool(void)
*** 135,146 ****
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
- 			buf->flags = 0;
- 			buf->usage_count = 0;
- 			buf->refcount = 0;
- 			buf->wait_backend_pid = 0;
  
! 			SpinLockInit(&buf->buf_hdr_lock);
  
  			buf->buf_id = i;
  
--- 135,143 ----
  			BufferDesc *buf = GetBufferDescriptor(i);
  
  			CLEAR_BUFFERTAG(buf->tag);
  
! 			pg_atomic_init_u32(&buf->state, 0);
! 			buf->wait_backend_pid = 0;
  
  			buf->buf_id = i;
  
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 6dd7c6e..4c85419
*** a/src/backend/storage/buffer/bufmgr.c
--- b/src/backend/storage/buffer/bufmgr.c
*************** ForgetPrivateRefCountEntry(PrivateRefCou
*** 427,432 ****
--- 427,464 ----
  	(GetPrivateRefCount(bufnum) > 0) \
  )
  
+ /*
+  * The following two macros simplify buffer state modification in a CAS
+  * loop.  A variable "uint32 state" must be declared outside the loop.
+  * They are used as follows:
+  *
+  * BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
+  * modifications of the state variable;
+  * END_BUFSTATE_CAS_LOOP(bufHdr);
+  *
+  * These macros shouldn't be used for local buffers.  Since there is no
+  * concurrency, local buffer state can be changed directly with atomic
+  * read/write operations.
+  */
+ #define BEGIN_BUFSTATE_CAS_LOOP(bufHdr) \
+ 	do { \
+ 		SpinDelayStatus	delayStatus = init_spin_delay((Pointer)(bufHdr)); \
+ 		uint32			oldstate; \
+ 		oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 		for (;;) { \
+ 			while (oldstate & BM_LOCKED) \
+ 			{ \
+ 				perform_spin_delay(&delayStatus); \
+ 				oldstate = pg_atomic_read_u32(&(bufHdr)->state); \
+ 			} \
+ 			state = oldstate
+ 
+ #define END_BUFSTATE_CAS_LOOP(bufHdr) \
+ 			if (pg_atomic_compare_exchange_u32(&bufHdr->state, &oldstate, state)) \
+ 				break; \
+ 		} \
+ 		finish_spin_delay(&delayStatus); \
+ 	} while (0)
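+ 
+ /*
+  * Usage sketch (the packed-state accessors such as BUF_REFCOUNT_ONE and
+  * BUF_STATE_GET_REFCOUNT are assumed to come from buf_internals.h, which is
+  * not included in this diff): UnpinBuffer's shared refcount decrement below
+  * reduces to
+  *
+  *		uint32	state;
+  *
+  *		BEGIN_BUFSTATE_CAS_LOOP(buf);
+  *		state -= BUF_REFCOUNT_ONE;
+  *		END_BUFSTATE_CAS_LOOP(buf);
+  *
+  * i.e. wait for any holder of the buffer header lock to release it, apply
+  * the modification to a local copy of the state, and retry the
+  * compare-and-swap until it succeeds.
+  */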
  
  static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
  				  ForkNumber forkNum, BlockNumber blockNum,
*************** static int	SyncOneBuffer(int buf_id, boo
*** 440,446 ****
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  int set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
--- 472,478 ----
  static void WaitIO(BufferDesc *buf);
  static bool StartBufferIO(BufferDesc *buf, bool forInput);
  static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
! 				  uint32 set_flag_bits);
  static void shared_buffer_write_error_callback(void *arg);
  static void local_buffer_write_error_callback(void *arg);
  static BufferDesc *BufferAlloc(SMgrRelation smgr,
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 816,823 ****
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			Assert(bufHdr->flags & BM_VALID);
! 			bufHdr->flags &= ~BM_VALID;
  		}
  		else
  		{
--- 848,857 ----
  		if (isLocalBuf)
  		{
  			/* Only need to adjust flags */
! 			uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 			Assert(state & BM_VALID);
! 			state &= ~BM_VALID;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  		else
  		{
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 828,837 ****
  			 */
  			do
  			{
! 				LockBufHdr(bufHdr);
! 				Assert(bufHdr->flags & BM_VALID);
! 				bufHdr->flags &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
--- 862,871 ----
  			 */
  			do
  			{
! 				uint32 state = LockBufHdr(bufHdr);
! 				Assert(state & BM_VALID);
! 				state &= ~BM_VALID;
! 				UnlockBufHdr(bufHdr, state);
  			} while (!StartBufferIO(bufHdr, true));
  		}
  	}
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 848,854 ****
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(bufHdr->flags & BM_VALID));		/* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
--- 882,888 ----
  	 * it's not been recycled) but come right back here to try smgrextend
  	 * again.
  	 */
! 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
  
  	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
  
*************** ReadBuffer_common(SMgrRelation smgr, cha
*** 933,939 ****
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		bufHdr->flags |= BM_VALID;
  	}
  	else
  	{
--- 967,975 ----
  	if (isLocalBuf)
  	{
  		/* Only need to adjust flags */
! 		uint32 state = pg_atomic_read_u32(&bufHdr->state);
! 		state |= BM_VALID;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  	else
  	{
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 987,996 ****
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
--- 1023,1033 ----
  	BufferTag	oldTag;			/* previous identity of selected buffer */
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
  	int			buf_id;
  	BufferDesc *buf;
  	bool		valid;
+ 	uint32		state;
  
  	/* create a tag so we can lookup the buffer */
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1059,1070 ****
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy);
  
! 		Assert(buf->refcount == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = buf->flags;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
--- 1096,1107 ----
  		 * Select a victim buffer.  The buffer is returned with its header
  		 * spinlock still held!
  		 */
! 		buf = StrategyGetBuffer(strategy, &state);
  
! 		Assert(BUF_STATE_GET_REFCOUNT(state) == 0);
  
  		/* Must copy buffer flags while we still hold the spinlock */
! 		oldFlags = state & BUF_FLAG_MASK;
  
  		/* Pin the buffer and then release the buffer spinlock */
  		PinBuffer_Locked(buf);
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1106,1116 ****
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
  
  					/* Read the LSN while holding buffer header lock */
! 					LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
--- 1143,1154 ----
  				if (strategy != NULL)
  				{
  					XLogRecPtr	lsn;
+ 					uint32		state;
  
  					/* Read the LSN while holding buffer header lock */
! 					state = LockBufHdr(buf);
  					lsn = BufferGetLSN(buf);
! 					UnlockBufHdr(buf, state);
  
  					if (XLogNeedsFlush(lsn) &&
  						StrategyRejectBuffer(strategy, buf))
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1254,1260 ****
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
--- 1292,1298 ----
  		/*
  		 * Need to lock the buffer header too in order to change its tag.
  		 */
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Somebody could have pinned or re-dirtied the buffer while we were
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1262,1272 ****
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = buf->flags;
! 		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
--- 1300,1310 ----
  		 * recycle this buffer; we must undo everything we've done and start
  		 * over with a new victim buffer.
  		 */
! 		oldFlags = state & BUF_FLAG_MASK;
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1 && !(oldFlags & BM_DIRTY))
  			break;
  
! 		UnlockBufHdr(buf, state);
  		BufTableDelete(&newTag, newHash);
  		if ((oldFlags & BM_TAG_VALID) &&
  			oldPartitionLock != newPartitionLock)
*************** BufferAlloc(SMgrRelation smgr, char relp
*** 1284,1297 ****
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
  	else
! 		buf->flags |= BM_TAG_VALID;
! 	buf->usage_count = 1;
  
! 	UnlockBufHdr(buf);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
--- 1322,1336 ----
  	 * 1 so that the buffer can survive one clock-sweep pass.)
  	 */
  	buf->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
! 			   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
! 			   BUF_USAGECOUNT_MASK);
  	if (relpersistence == RELPERSISTENCE_PERMANENT)
! 		state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
  	else
! 		state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  
! 	UnlockBufHdr(buf, state);
  
  	if (oldFlags & BM_TAG_VALID)
  	{
*************** InvalidateBuffer(BufferDesc *buf)
*** 1338,1349 ****
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	BufFlags	oldFlags;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
--- 1377,1391 ----
  	BufferTag	oldTag;
  	uint32		oldHash;		/* hash value for oldTag */
  	LWLock	   *oldPartitionLock;		/* buffer partition lock for it */
! 	uint32		oldFlags;
! 	uint32		state;
  
  	/* Save the original buffer tag before dropping the spinlock */
  	oldTag = buf->tag;
  
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
*************** retry:
*** 1362,1373 ****
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
--- 1404,1415 ----
  	LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
  
  	/* Re-lock the buffer header */
! 	state = LockBufHdr(buf);
  
  	/* If it's changed while we were waiting for lock, do nothing */
  	if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		return;
  	}
*************** retry:
*** 1381,1389 ****
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (buf->refcount != 0)
  	{
! 		UnlockBufHdr(buf);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
--- 1423,1431 ----
  	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
  	 * be busy-looping here.)
  	 */
! 	if (BUF_STATE_GET_REFCOUNT(state) != 0)
  	{
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(oldPartitionLock);
  		/* safety check: should definitely not be our *own* pin */
  		if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
*************** retry:
*** 1396,1407 ****
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = buf->flags;
  	CLEAR_BUFFERTAG(buf->tag);
! 	buf->flags = 0;
! 	buf->usage_count = 0;
! 
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
--- 1438,1447 ----
  	 * Clear out the buffer's tag and flags.  We must do this to ensure that
  	 * linear scans of the buffer array don't think the buffer is valid.
  	 */
! 	oldFlags = state & BUF_FLAG_MASK;
  	CLEAR_BUFFERTAG(buf->tag);
! 	state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Remove the buffer from the lookup hashtable, if it was in there.
*************** retry:
*** 1432,1438 ****
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr;
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
--- 1472,1481 ----
  void
  MarkBufferDirty(Buffer buffer)
  {
! 	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
! 	uint32		state;
! 	bool		wasDirty;
! 
  
  	if (!BufferIsValid(buffer))
  		elog(ERROR, "bad buffer ID: %d", buffer);
*************** MarkBufferDirty(Buffer buffer)
*** 1443,1472 ****
  		return;
  	}
  
- 	bufHdr = GetBufferDescriptor(buffer - 1);
- 
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	LockBufHdr(bufHdr);
  
! 	Assert(bufHdr->refcount > 0);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!(bufHdr->flags & BM_DIRTY))
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
- 
- 	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- 
- 	UnlockBufHdr(bufHdr);
  }
  
  /*
--- 1486,1515 ----
  		return;
  	}
  
  	Assert(BufferIsPinned(buffer));
  	/* unfortunately we can't check if the lock is held exclusively */
  	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
  
! 	BEGIN_BUFSTATE_CAS_LOOP(bufHdr);
  
! 	Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 	wasDirty = (state & BM_DIRTY) ? true : false;
! 	state |= BM_DIRTY | BM_JUST_DIRTIED;
! 
! 	/* skip the CAS if somebody else already set both flag bits */
! 	if (state == oldstate)
! 		break;
! 
! 	END_BUFSTATE_CAS_LOOP(bufHdr);
  
  	/*
  	 * If the buffer was not dirty already, do vacuum accounting.
  	 */
! 	if (!wasDirty)
  	{
  		VacuumPageDirty++;
  		pgBufferUsage.shared_blks_dirtied++;
  		if (VacuumCostActive)
  			VacuumCostBalance += VacuumCostPageDirty;
  	}
  }
  
  /*
*************** ReleaseAndReadBuffer(Buffer buffer,
*** 1531,1536 ****
--- 1574,1584 ----
   *
   * This should be applied only to shared buffers, never local ones.
   *
+  * Since buffers are pinned/unpinned very frequently, this function tries
+  * to pin the buffer as cheaply as possible.  Therefore we don't take the
+  * buffer header lock here, but instead update the state variable in a loop
+  * of CAS operations.  Hopefully it's just a single CAS.
+  *
   * Note that ResourceOwnerEnlargeBuffers must have been done already.
   *
   * Returns TRUE if buffer is BM_VALID, else FALSE.  This provision allows
*************** PinBuffer(BufferDesc *buf, BufferAccessS
*** 1547,1569 ****
  
  	if (ref == NULL)
  	{
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		LockBufHdr(buf);
! 		buf->refcount++;
! 		if (strategy == NULL)
! 		{
! 			if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 				buf->usage_count++;
! 		}
! 		else
! 		{
! 			if (buf->usage_count == 0)
! 				buf->usage_count = 1;
! 		}
! 		result = (buf->flags & BM_VALID) != 0;
! 		UnlockBufHdr(buf);
  	}
  	else
  	{
--- 1595,1617 ----
  
  	if (ref == NULL)
  	{
+ 		/* loop of CAS operations */
+ 		uint32			state;
+ 
  		ReservePrivateRefCountEntry();
  		ref = NewPrivateRefCountEntry(b);
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 
! 		/* increase refcount */
! 		state += BUF_REFCOUNT_ONE;
! 
! 		/* increase usagecount unless already max */
! 		if (BUF_STATE_GET_USAGECOUNT(state) != BM_MAX_USAGE_COUNT)
! 			state += BUF_USAGECOUNT_ONE;
! 
! 		END_BUFSTATE_CAS_LOOP(buf);
! 		result = (state & BM_VALID) != 0;
  	}
  	else
  	{
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1603,1608 ****
--- 1651,1657 ----
  {
  	Buffer		b;
  	PrivateRefCountEntry *ref;
+ 	uint32		state;
  
  	/*
  	 * As explained, We don't expect any preexisting pins. That allows us to
*************** PinBuffer_Locked(BufferDesc *buf)
*** 1610,1617 ****
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	buf->refcount++;
! 	UnlockBufHdr(buf);
  
  	b = BufferDescriptorGetBuffer(buf);
  
--- 1659,1672 ----
  	 */
  	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
  
! 	/*
! 	 * Since we hold the buffer header spinlock, we can update the buffer
! 	 * state in a single write operation.
! 	 */
! 	state = pg_atomic_read_u32(&buf->state);
! 	Assert(state & BM_LOCKED);
! 	state += BUF_REFCOUNT_ONE;
! 	UnlockBufHdr(buf, state);
  
  	b = BufferDescriptorGetBuffer(buf);
  
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1646,1675 ****
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		LockBufHdr(buf);
  
! 		/* Decrement the shared reference count */
! 		Assert(buf->refcount > 0);
! 		buf->refcount--;
  
  		/* Support LockBufferForCleanup() */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
! 			buf->refcount == 1)
  		{
! 			/* we just released the last pin other than the waiter's */
! 			int			wait_backend_pid = buf->wait_backend_pid;
  
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
! 			UnlockBufHdr(buf);
! 			ProcSendSignal(wait_backend_pid);
! 		}
! 		else
! 			UnlockBufHdr(buf);
  
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
--- 1701,1741 ----
  	ref->refcount--;
  	if (ref->refcount == 0)
  	{
+ 		uint32			state;
+ 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
  		Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
  
! 		/*
! 		 * Decrement the shared reference count.
! 		 *
! 		 * Since the buffer header spinlock holder can update the state with a
! 		 * plain write, it's not safe to use an atomic decrement here.  Instead
! 		 * we use a loop of CAS operations, as PinBuffer does.
! 		 */
  
! 		BEGIN_BUFSTATE_CAS_LOOP(buf);
! 		state -= BUF_REFCOUNT_ONE;
! 		END_BUFSTATE_CAS_LOOP(buf);
  
  		/* Support LockBufferForCleanup() */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			state = LockBufHdr(buf);
  
! 			if ((state & BM_PIN_COUNT_WAITER) && BUF_STATE_GET_REFCOUNT(state) == 1)
! 			{
! 				/* we just released the last pin other than the waiter's */
! 				int			wait_backend_pid = buf->wait_backend_pid;
  
+ 				state &= ~BM_PIN_COUNT_WAITER;
+ 				UnlockBufHdr(buf, state);
+ 				ProcSendSignal(wait_backend_pid);
+ 			}
+ 			else
+ 				UnlockBufHdr(buf, state);
+ 		}
  		ForgetPrivateRefCountEntry(ref);
  	}
  }
*************** UnpinBuffer(BufferDesc *buf, bool fixOwn
*** 1687,1692 ****
--- 1753,1759 ----
  static void
  BufferSync(int flags)
  {
+ 	uint32		state;
  	int			buf_id;
  	int			num_to_scan;
  	int			num_spaces;
*************** BufferSync(int flags)
*** 1736,1748 ****
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		LockBufHdr(bufHdr);
  
! 		if ((bufHdr->flags & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
--- 1803,1815 ----
  		 * Header spinlock is enough to examine BM_DIRTY, see comment in
  		 * SyncOneBuffer.
  		 */
! 		state = LockBufHdr(bufHdr);
  
! 		if ((state & mask) == mask)
  		{
  			CkptSortItem *item;
  
! 			state |= BM_CHECKPOINT_NEEDED;
  
  			item = &CkptBufferIds[num_to_scan++];
  			item->buf_id = buf_id;
*************** BufferSync(int flags)
*** 1752,1758 ****
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr);
  	}
  
  	if (num_to_scan == 0)
--- 1819,1825 ----
  			item->blockNum = bufHdr->tag.blockNum;
  		}
  
! 		UnlockBufHdr(bufHdr, state);
  	}
  
  	if (num_to_scan == 0)
*************** BufferSync(int flags)
*** 1888,1894 ****
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
--- 1955,1961 ----
  		 * write the buffer though we didn't need to.  It doesn't seem worth
  		 * guarding against this, though.
  		 */
! 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
  		{
  			if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
  			{
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2258,2263 ****
--- 2325,2331 ----
  {
  	BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
  	int			result = 0;
+ 	uint32		state;
  	BufferTag	tag;
  
  	ReservePrivateRefCountEntry();
*************** SyncOneBuffer(int buf_id, bool skip_rece
*** 2271,2291 ****
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	LockBufHdr(bufHdr);
  
! 	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
  		result |= BUF_REUSABLE;
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
! 	if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr);
  		return result;
  	}
  
--- 2339,2362 ----
  	 * don't worry because our checkpoint.redo points before log record for
  	 * upcoming changes and so we are not required to write such dirty buffer.
  	 */
! 	state = LockBufHdr(bufHdr);
  
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0 &&
! 		BUF_STATE_GET_USAGECOUNT(state) == 0)
! 	{
  		result |= BUF_REUSABLE;
+ 	}
  	else if (skip_recently_used)
  	{
  		/* Caller told us not to write recently-used buffers */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
! 	if (!(state & BM_VALID) || !(state & BM_DIRTY))
  	{
  		/* It's clean, so nothing to do */
! 		UnlockBufHdr(bufHdr, state);
  		return result;
  	}
  
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2439,2444 ****
--- 2510,2516 ----
  	int32		loccount;
  	char	   *path;
  	BackendId	backend;
+ 	uint32		state;
  
  	Assert(BufferIsValid(buffer));
  	if (BufferIsLocal(buffer))
*************** PrintBufferLeakWarning(Buffer buffer)
*** 2456,2467 ****
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, buf->flags,
! 		 buf->refcount, loccount);
  	pfree(path);
  }
  
--- 2528,2540 ----
  
  	/* theoretically we should lock the bufhdr here */
  	path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
+ 	state = pg_atomic_read_u32(&buf->state);
  	elog(WARNING,
  		 "buffer refcount leak: [%03d] "
  		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
  		 buffer, path,
! 		 buf->tag.blockNum, state & BUF_FLAG_MASK,
! 		 BUF_STATE_GET_REFCOUNT(state), loccount);
  	pfree(path);
  }
  
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2573,2578 ****
--- 2646,2652 ----
  				io_time;
  	Block		bufBlock;
  	char	   *bufToWrite;
+ 	uint32		state;
  
  	/*
  	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2598,2604 ****
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
--- 2672,2678 ----
  										reln->smgr_rnode.node.dbNode,
  										reln->smgr_rnode.node.relNode);
  
! 	state = LockBufHdr(buf);
  
  	/*
  	 * Run PageGetLSN while holding header lock, since we don't have the
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2607,2614 ****
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	buf->flags &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
--- 2681,2688 ----
  	recptr = BufferGetLSN(buf);
  
  	/* To check if block content changes while flushing. - vadim 01/17/97 */
! 	state &= ~BM_JUST_DIRTIED;
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
*************** FlushBuffer(BufferDesc *buf, SMgrRelatio
*** 2627,2633 ****
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (buf->flags & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
--- 2701,2707 ----
  	 * disastrous system-wide consequences.  To make sure that can't happen,
  	 * skip the flush if the buffer isn't permanent.
  	 */
! 	if (state & BM_PERMANENT)
  		XLogFlush(recptr);
  
  	/*
*************** BufferIsPermanent(Buffer buffer)
*** 2716,2727 ****
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header flags while we're doing this, we assume that
! 	 * changing an aligned 2-byte BufFlags value is atomic, so we'll read the
! 	 * old value or the new value, but not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (bufHdr->flags & BM_PERMANENT) != 0;
  }
  
  /*
--- 2790,2801 ----
  	/*
  	 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
  	 * need not bother with the buffer header spinlock.  Even if someone else
! 	 * changes the buffer header state while we're doing this, the state is
! 	 * changed atomically, so we'll read the old value or the new value, but
! 	 * not random garbage.
  	 */
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
  }
  
  /*
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2736,2741 ****
--- 2810,2816 ----
  	BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
  	char	   *page = BufferGetPage(buffer);
  	XLogRecPtr	lsn;
+ 	uint32		state;
  
  	/*
  	 * If we don't need locking for correctness, fastpath out.
*************** BufferGetLSNAtomic(Buffer buffer)
*** 2747,2755 ****
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr);
  
  	return lsn;
  }
--- 2822,2830 ----
  	Assert(BufferIsValid(buffer));
  	Assert(BufferIsPinned(buffer));
  
! 	state = LockBufHdr(bufHdr);
  	lsn = PageGetLSN(page);
! 	UnlockBufHdr(bufHdr, state);
  
  	return lsn;
  }
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2797,2802 ****
--- 2872,2878 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * We can make this a tad faster by prechecking the buffer tag before
*************** DropRelFileNodeBuffers(RelFileNodeBacken
*** 2817,2829 ****
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 2893,2905 ----
  		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2887,2892 ****
--- 2963,2969 ----
  	{
  		RelFileNode *rnode = NULL;
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropRelFileNodesAllBuffers(RelFileNodeBa
*** 2917,2927 ****
  		if (rnode == NULL)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  
  	pfree(nodes);
--- 2994,3004 ----
  		if (rnode == NULL)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  
  	pfree(nodes);
*************** DropDatabaseBuffers(Oid dbid)
*** 2951,2956 ****
--- 3028,3034 ----
  	for (i = 0; i < NBuffers; i++)
  	{
  		BufferDesc *bufHdr = GetBufferDescriptor(i);
+ 		uint32		state;
  
  		/*
  		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
*************** DropDatabaseBuffers(Oid dbid)
*** 2959,2969 ****
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3037,3047 ----
  		if (bufHdr->tag.rnode.dbNode != dbid)
  			continue;
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid)
  			InvalidateBuffer(bufHdr);	/* releases spinlock */
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushRelationBuffers(Relation rel)
*** 3055,3063 ****
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
--- 3133,3144 ----
  	{
  		for (i = 0; i < NLocBuffer; i++)
  		{
+ 			uint32	state;
+ 
  			bufHdr = GetLocalBufferDescriptor(i);
  			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 				((state = pg_atomic_read_u32(&bufHdr->state)) &
! 					(BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  			{
  				ErrorContextCallback errcallback;
  				Page		localpage;
*************** FlushRelationBuffers(Relation rel)
*** 3078,3084 ****
  						  localpage,
  						  false);
  
! 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
--- 3159,3166 ----
  						  localpage,
  						  false);
  
! 				state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
! 				pg_atomic_write_u32(&bufHdr->state, state);
  
  				/* Pop the error context stack */
  				error_context_stack = errcallback.previous;
*************** FlushRelationBuffers(Relation rel)
*** 3093,3098 ****
--- 3175,3182 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32 state;
+ 
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushRelationBuffers(Relation rel)
*** 3104,3112 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3188,3196 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
! 			(state & (BM_VALID | BM_DIRTY))	== (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushRelationBuffers(Relation rel)
*** 3115,3121 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3199,3205 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** FlushDatabaseBuffers(Oid dbid)
*** 3145,3150 ****
--- 3229,3235 ----
  
  	for (i = 0; i < NBuffers; i++)
  	{
+ 		uint32	state;
  		bufHdr = GetBufferDescriptor(i);
  
  		/*
*************** FlushDatabaseBuffers(Oid dbid)
*** 3156,3164 ****
  
  		ReservePrivateRefCountEntry();
  
! 		LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
--- 3241,3249 ----
  
  		ReservePrivateRefCountEntry();
  
! 		state = LockBufHdr(bufHdr);
  		if (bufHdr->tag.rnode.dbNode == dbid &&
! 			(state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
  		{
  			PinBuffer_Locked(bufHdr);
  			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
*************** FlushDatabaseBuffers(Oid dbid)
*** 3167,3173 ****
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr);
  	}
  }
  
--- 3252,3258 ----
  			UnpinBuffer(bufHdr, true);
  		}
  		else
! 			UnlockBufHdr(bufHdr, state);
  	}
  }
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3297,3308 ****
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
--- 3382,3394 ----
  	 * is only intended to be used in cases where failing to write out the
  	 * data would be harmless anyway, it doesn't really matter.
  	 */
! 	if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
  		(BM_DIRTY | BM_JUST_DIRTIED))
  	{
  		XLogRecPtr	lsn = InvalidXLogRecPtr;
  		bool		dirtied = false;
  		bool		delayChkpt = false;
+ 		uint32		state;
  
  		/*
  		 * If we need to protect hint bit updates from torn writes, WAL-log a
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3313,3319 ****
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (bufHdr->flags & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
--- 3399,3405 ----
  		 * We don't check full_page_writes here because that logic is included
  		 * when we call XLogInsert() since the value changes dynamically.
  		 */
! 		if (XLogHintBitIsNeeded() && (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
  		{
  			/*
  			 * If we're in recovery we cannot dirty a page because of a hint.
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3352,3360 ****
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (!(bufHdr->flags & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
--- 3438,3448 ----
  			lsn = XLogSaveBufferForHint(buffer, buffer_std);
  		}
  
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 
! 		if (!(state & BM_DIRTY))
  		{
  			dirtied = true;		/* Means "will be dirtied by this action" */
  
*************** MarkBufferDirtyHint(Buffer buffer, bool 
*** 3374,3381 ****
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
! 		UnlockBufHdr(bufHdr);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
--- 3462,3470 ----
  			if (!XLogRecPtrIsInvalid(lsn))
  				PageSetLSN(page, lsn);
  		}
! 
! 		state |= BM_DIRTY | BM_JUST_DIRTIED;
! 		UnlockBufHdr(bufHdr, state);
  
  		if (delayChkpt)
  			MyPgXact->delayChkpt = false;
*************** UnlockBuffers(void)
*** 3406,3422 ****
  
  	if (buf)
  	{
! 		LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			buf->flags &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf);
  
  		PinCountWaitBuf = NULL;
  	}
--- 3495,3513 ----
  
  	if (buf)
  	{
! 		uint32	state;
! 
! 		state = LockBufHdr(buf);
  
  		/*
  		 * Don't complain if flag bit not set; it could have been reset but we
  		 * got a cancel/die interrupt before getting the signal.
  		 */
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			buf->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
  
! 		UnlockBufHdr(buf, state);
  
  		PinCountWaitBuf = NULL;
  	}
*************** LockBufferForCleanup(Buffer buffer)
*** 3509,3535 ****
  
  	for (;;)
  	{
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		LockBufHdr(bufHdr);
! 		Assert(bufHdr->refcount > 0);
! 		if (bufHdr->refcount == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (bufHdr->flags & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
- 		bufHdr->flags |= BM_PIN_COUNT_WAITER;
  		PinCountWaitBuf = bufHdr;
! 		UnlockBufHdr(bufHdr);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
--- 3600,3629 ----
  
  	for (;;)
  	{
+ 		uint32	state;
+ 
  		/* Try to acquire lock */
  		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
! 		state = LockBufHdr(bufHdr);
! 
! 		Assert(BUF_STATE_GET_REFCOUNT(state) > 0);
! 		if (BUF_STATE_GET_REFCOUNT(state) == 1)
  		{
  			/* Successfully acquired exclusive lock with pincount 1 */
! 			UnlockBufHdr(bufHdr, state);
  			return;
  		}
  		/* Failed, so mark myself as waiting for pincount 1 */
! 		if (state & BM_PIN_COUNT_WAITER)
  		{
! 			UnlockBufHdr(bufHdr, state);
  			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  			elog(ERROR, "multiple backends attempting to wait for pincount 1");
  		}
  		bufHdr->wait_backend_pid = MyProcPid;
  		PinCountWaitBuf = bufHdr;
! 		state |= BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  
  		/* Report the wait */
*************** LockBufferForCleanup(Buffer buffer)
*** 3558,3568 ****
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		LockBufHdr(bufHdr);
! 		if ((bufHdr->flags & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			bufHdr->flags &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
--- 3652,3662 ----
  		 * impossible with the current usages due to table level locking, but
  		 * better be safe.
  		 */
! 		state = LockBufHdr(bufHdr);
! 		if ((state & BM_PIN_COUNT_WAITER) != 0 &&
  			bufHdr->wait_backend_pid == MyProcPid)
! 			state &= ~BM_PIN_COUNT_WAITER;
! 		UnlockBufHdr(bufHdr, state);
  
  		PinCountWaitBuf = NULL;
  		/* Loop back and try again */
*************** bool
*** 3603,3624 ****
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
  		/* There should be exactly one pin */
! 		Assert(LocalRefCount[-buffer - 1] > 0);
! 		if (LocalRefCount[-buffer - 1] != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	Assert(GetPrivateRefCount(buffer) > 0);
! 	if (GetPrivateRefCount(buffer) != 1)
  		return false;
  
  	/* Try to acquire lock */
--- 3697,3722 ----
  ConditionalLockBufferForCleanup(Buffer buffer)
  {
  	BufferDesc *bufHdr;
+ 	uint32		state,
+ 				refcount;
  
  	Assert(BufferIsValid(buffer));
  
  	if (BufferIsLocal(buffer))
  	{
+ 		refcount = LocalRefCount[-buffer - 1];
  		/* There should be exactly one pin */
! 		Assert(refcount > 0);
! 		if (refcount != 1)
  			return false;
  		/* Nobody else to wait for */
  		return true;
  	}
  
  	/* There should be exactly one local pin */
! 	refcount = GetPrivateRefCount(buffer);
! 	Assert(refcount);
! 	if (refcount != 1)
  		return false;
  
  	/* Try to acquire lock */
*************** ConditionalLockBufferForCleanup(Buffer b
*** 3626,3642 ****
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	LockBufHdr(bufHdr);
! 	Assert(bufHdr->refcount > 0);
! 	if (bufHdr->refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
--- 3724,3742 ----
  		return false;
  
  	bufHdr = GetBufferDescriptor(buffer - 1);
! 	state = LockBufHdr(bufHdr);
! 	refcount = BUF_STATE_GET_REFCOUNT(state);
! 
! 	Assert(refcount > 0);
! 	if (refcount == 1)
  	{
  		/* Successfully acquired exclusive lock with pincount 1 */
! 		UnlockBufHdr(bufHdr, state);
  		return true;
  	}
  
  	/* Failed, so release the lock */
! 	UnlockBufHdr(bufHdr, state);
  	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
  	return false;
  }
*************** WaitIO(BufferDesc *buf)
*** 3666,3682 ****
  	 */
  	for (;;)
  	{
! 		BufFlags	sv_flags;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		LockBufHdr(buf);
! 		sv_flags = buf->flags;
! 		UnlockBufHdr(buf);
! 		if (!(sv_flags & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
--- 3766,3782 ----
  	 */
  	for (;;)
  	{
! 		uint32		state;
  
  		/*
  		 * It may not be necessary to acquire the spinlock to check the flag
  		 * here, but since this test is essential for correctness, we'd better
  		 * play it safe.
  		 */
! 		state = LockBufHdr(buf);
! 		UnlockBufHdr(buf, state);
! 
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
*************** WaitIO(BufferDesc *buf)
*** 3704,3709 ****
--- 3804,3811 ----
  static bool
  StartBufferIO(BufferDesc *buf, bool forInput)
  {
+ 	uint32		state;
+ 
  	Assert(!InProgressBuf);
  
  	for (;;)
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3714,3722 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
  
! 		if (!(buf->flags & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
--- 3816,3824 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
  
! 		if (!(state & BM_IO_IN_PROGRESS))
  			break;
  
  		/*
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3725,3748 ****
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	buf->flags |= BM_IO_IN_PROGRESS;
! 
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
--- 3827,3849 ----
  		 * an error (see AbortBufferIO).  If that's the case, we must wait for
  		 * him to get unwedged.
  		 */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		WaitIO(buf);
  	}
  
  	/* Once we get here, there is definitely no I/O active on this buffer */
  
! 	if (forInput ? (state & BM_VALID) : !(state & BM_DIRTY))
  	{
  		/* someone else already did the I/O */
! 		UnlockBufHdr(buf, state);
  		LWLockRelease(BufferDescriptorGetIOLock(buf));
  		return false;
  	}
  
! 	state |= BM_IO_IN_PROGRESS;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = buf;
  	IsForInput = forInput;
*************** StartBufferIO(BufferDesc *buf, bool forI
*** 3768,3786 ****
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
  {
  	Assert(buf == InProgressBuf);
  
! 	LockBufHdr(buf);
  
! 	Assert(buf->flags & BM_IO_IN_PROGRESS);
! 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
! 		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 	buf->flags |= set_flag_bits;
  
! 	UnlockBufHdr(buf);
  
  	InProgressBuf = NULL;
  
--- 3869,3890 ----
   * be 0, or BM_VALID if we just finished reading in the page.
   */
  static void
! TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
  {
+ 	uint32		state;
+ 
  	Assert(buf == InProgressBuf);
  
! 	state = LockBufHdr(buf);
  
! 	Assert(state & BM_IO_IN_PROGRESS);
  
! 	state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
! 	if (clear_dirty && !(state & BM_JUST_DIRTIED))
! 		state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
! 
! 	state |= set_flag_bits;
! 	UnlockBufHdr(buf, state);
  
  	InProgressBuf = NULL;
  
*************** AbortBufferIO(void)
*** 3803,3808 ****
--- 3907,3913 ----
  
  	if (buf)
  	{
+ 		uint32	state;
  		/*
  		 * Since LWLockReleaseAll has already been called, we're not holding
  		 * the buffer's io_in_progress_lock. We have to re-acquire it so that
*************** AbortBufferIO(void)
*** 3811,3834 ****
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		LockBufHdr(buf);
! 		Assert(buf->flags & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(buf->flags & BM_DIRTY));
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(buf->flags & BM_VALID));
! 			UnlockBufHdr(buf);
  		}
  		else
  		{
! 			BufFlags	sv_flags;
! 
! 			sv_flags = buf->flags;
! 			Assert(sv_flags & BM_DIRTY);
! 			UnlockBufHdr(buf);
  			/* Issue notice if this is not the first failure... */
! 			if (sv_flags & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
--- 3916,3937 ----
  		 */
  		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
  
! 		state = LockBufHdr(buf);
! 		Assert(state & BM_IO_IN_PROGRESS);
  		if (IsForInput)
  		{
! 			Assert(!(state & BM_DIRTY));
! 
  			/* We'd better not think buffer is valid yet */
! 			Assert(!(state & BM_VALID));
! 			UnlockBufHdr(buf, state);
  		}
  		else
  		{
! 			Assert(state & BM_DIRTY);
! 			UnlockBufHdr(buf, state);
  			/* Issue notice if this is not the first failure... */
! 			if (state & BM_IO_ERROR)
  			{
  				/* Buffer is pinned, so we can read tag without spinlock */
  				char	   *path;
*************** rnode_comparator(const void *p1, const v
*** 3912,3917 ****
--- 4015,4041 ----
  }
  
  /*
+  * Lock buffer header - set BM_LOCKED in buffer state.
+  */
+ uint32
+ LockBufHdr(BufferDesc *desc)
+ {
+ 	SpinDelayStatus	delayStatus = init_spin_delay((Pointer)(desc));
+ 	uint32			oldstate;
+ 
+ 	while (true)
+ 	{
+ 		/* set BM_LOCKED flag */
+ 		oldstate = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
+ 		/* if it wasn't set before we're OK */
+ 		if (!(oldstate & BM_LOCKED))
+ 			break;
+ 		perform_spin_delay(&delayStatus);
+ 	}
+ 	finish_spin_delay(&delayStatus);
+ 	return oldstate | BM_LOCKED;
+ }
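+ 
+ /*
+  * Note: UnlockBufHdr() now takes the state word to write back.  Its
+  * definition lives in buf_internals.h, which is not included in this diff;
+  * it is assumed to clear BM_LOCKED and store the value with a single atomic
+  * write, roughly
+  *
+  *		pg_atomic_write_u32(&desc->state, state & ~BM_LOCKED);
+  *
+  * plus whatever write barrier is required.  This is why callers must pass a
+  * state value that reflects any flag changes made while the lock was held.
+  */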
+ 
+ /*
   * BufferTag comparator.
   */
  static int
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 551d152..5ee8ff3
*** a/src/backend/storage/buffer/freelist.c
--- b/src/backend/storage/buffer/freelist.c
*************** typedef struct BufferAccessStrategyData
*** 98,104 ****
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
--- 98,105 ----
  
  
  /* Prototypes for internal functions */
! static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
! 											  uint32 *lockstate);
  static void AddBufferToRing(BufferAccessStrategy strategy,
  				BufferDesc *buf);
  
*************** ClockSweepTick(void)
*** 180,186 ****
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
--- 181,187 ----
   *	return the buffer with the buffer header spinlock still held.
   */
  BufferDesc *
! StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	int			bgwprocno;
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 192,198 ****
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy);
  		if (buf != NULL)
  			return buf;
  	}
--- 193,199 ----
  	 */
  	if (strategy != NULL)
  	{
! 		buf = GetBufferFromRing(strategy, lockstate);
  		if (buf != NULL)
  			return buf;
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 250,255 ****
--- 251,258 ----
  	{
  		while (true)
  		{
+ 			uint32	state;
+ 
  			/* Acquire the spinlock to remove element from the freelist */
  			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 279,292 ****
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			LockBufHdr(buf);
! 			if (buf->refcount == 0 && buf->usage_count == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
  				return buf;
  			}
! 			UnlockBufHdr(buf);
  
  		}
  	}
--- 282,297 ----
  			 * it before we got to it.  It's probably impossible altogether as
  			 * of 8.3, but we'd better check anyway.)
  			 */
! 			state = LockBufHdr(buf);
! 			if (BUF_STATE_GET_REFCOUNT(state) == 0
! 				&& BUF_STATE_GET_USAGECOUNT(state) == 0)
  			{
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
! 			UnlockBufHdr(buf, state);
  
  		}
  	}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 295,300 ****
--- 300,306 ----
  	trycounter = NBuffers;
  	for (;;)
  	{
+ 		uint32	state;
  
  		buf = GetBufferDescriptor(ClockSweepTick());
  
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 302,313 ****
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		LockBufHdr(buf);
! 		if (buf->refcount == 0)
  		{
! 			if (buf->usage_count > 0)
  			{
! 				buf->usage_count--;
  				trycounter = NBuffers;
  			}
  			else
--- 308,321 ----
  		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
  		 * it; decrement the usage_count (unless pinned) and keep scanning.
  		 */
! 		state = LockBufHdr(buf);
! 
! 		if (BUF_STATE_GET_REFCOUNT(state) == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) != 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 
  				trycounter = NBuffers;
  			}
  			else
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 315,320 ****
--- 323,329 ----
  				/* Found a usable buffer */
  				if (strategy != NULL)
  					AddBufferToRing(strategy, buf);
+ 				*lockstate = state;
  				return buf;
  			}
  		}
*************** StrategyGetBuffer(BufferAccessStrategy s
*** 327,336 ****
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			UnlockBufHdr(buf);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		UnlockBufHdr(buf);
  	}
  }
  
--- 336,347 ----
  			 * probably better to fail than to risk getting stuck in an
  			 * infinite loop.
  			 */
! 			state &= ~BM_LOCKED;
! 			pg_atomic_write_u32(&buf->state, state);
  			elog(ERROR, "no unpinned buffers available");
  		}
! 		state &= ~BM_LOCKED;
! 		pg_atomic_write_u32(&buf->state, state);
  	}
  }
  
*************** FreeAccessStrategy(BufferAccessStrategy 
*** 585,594 ****
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
--- 596,606 ----
   * The bufhdr spin lock is held on the returned buffer.
   */
  static BufferDesc *
! GetBufferFromRing(BufferAccessStrategy strategy, uint32 *lockstate)
  {
  	BufferDesc *buf;
  	Buffer		bufnum;
+ 	uint32		state;
  
  	/* Advance to next ring slot */
  	if (++strategy->current >= strategy->ring_size)
*************** GetBufferFromRing(BufferAccessStrategy s
*** 616,628 ****
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	LockBufHdr(buf);
! 	if (buf->refcount == 0 && buf->usage_count <= 1)
  	{
  		strategy->current_was_in_ring = true;
  		return buf;
  	}
! 	UnlockBufHdr(buf);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
--- 628,642 ----
  	 * shouldn't re-use it.
  	 */
  	buf = GetBufferDescriptor(bufnum - 1);
! 	state = LockBufHdr(buf);
! 	if (BUF_STATE_GET_REFCOUNT(state) == 0
! 		&& BUF_STATE_GET_USAGECOUNT(state) <= 1)
  	{
  		strategy->current_was_in_ring = true;
+ 		*lockstate = state;
  		return buf;
  	}
! 	UnlockBufHdr(buf, state);
  
  	/*
  	 * Tell caller to allocate a new buffer with the normal allocation
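
For illustration only, not part of the patch: a minimal caller-side sketch of how the new uint32 out-parameter of StrategyGetBuffer()/GetBufferFromRing() is meant to be consumed (assuming a BufferAccessStrategy "strategy"; the real caller is the buffer-replacement code in bufmgr.c). Both functions now return with the buffer header spinlock held and hand back the locked state value, so the caller can test flags without re-reading the descriptor:

	uint32		state;
	BufferDesc *buf;

	/* Returns with the buffer header spinlock held; "state" is the locked
	 * value of buf->state. */
	buf = StrategyGetBuffer(strategy, &state);

	if (state & BM_DIRTY)
	{
		/* The spinlock must be released before doing any I/O. */
		UnlockBufHdr(buf, state);
		/* ... write out the old page, then continue with replacement ... */
	}
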
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 17640cf..edc0ada
*** a/src/backend/storage/buffer/localbuf.c
--- b/src/backend/storage/buffer/localbuf.c
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 108,113 ****
--- 108,114 ----
  	int			b;
  	int			trycounter;
  	bool		found;
+ 	uint32		state;
  
  	INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
  
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 128,143 ****
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
! 				bufHdr->usage_count++;
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (bufHdr->flags & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
--- 129,149 ----
  		fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
  				smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
  #endif
+ 		state = pg_atomic_read_u32(&bufHdr->state);
+ 
  		/* this part is equivalent to PinBuffer for a shared buffer */
  		if (LocalRefCount[b] == 0)
  		{
! 			if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
! 			{
! 				state += BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
! 			}
  		}
  		LocalRefCount[b]++;
  		ResourceOwnerRememberBuffer(CurrentResourceOwner,
  									BufferDescriptorGetBuffer(bufHdr));
! 		if (state & BM_VALID)
  			*foundPtr = TRUE;
  		else
  		{
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 169,177 ****
  
  		if (LocalRefCount[b] == 0)
  		{
! 			if (bufHdr->usage_count > 0)
  			{
! 				bufHdr->usage_count--;
  				trycounter = NLocBuffer;
  			}
  			else
--- 175,186 ----
  
  		if (LocalRefCount[b] == 0)
  		{
! 			state = pg_atomic_read_u32(&bufHdr->state);
! 
! 			if (BUF_STATE_GET_USAGECOUNT(state) > 0)
  			{
! 				state -= BUF_USAGECOUNT_ONE;
! 				pg_atomic_write_u32(&bufHdr->state, state);
  				trycounter = NLocBuffer;
  			}
  			else
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 193,199 ****
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (bufHdr->flags & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
--- 202,208 ----
  	 * this buffer is not referenced but it might still be dirty. if that's
  	 * the case, write it out before reusing it!
  	 */
! 	if (state & BM_DIRTY)
  	{
  		SMgrRelation oreln;
  		Page		localpage = (char *) LocalBufHdrGetBlock(bufHdr);
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 211,217 ****
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		bufHdr->flags &= ~BM_DIRTY;
  
  		pgBufferUsage.local_blks_written++;
  	}
--- 220,227 ----
  				  false);
  
  		/* Mark not-dirty now in case we error out below */
! 		state &= ~BM_DIRTY;
! 		pg_atomic_write_u32(&bufHdr->state, state);
  
  		pgBufferUsage.local_blks_written++;
  	}
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 228,234 ****
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (bufHdr->flags & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
--- 238,244 ----
  	/*
  	 * Update the hash table: remove old entry, if any, and make new one.
  	 */
! 	if (state & BM_TAG_VALID)
  	{
  		hresult = (LocalBufferLookupEnt *)
  			hash_search(LocalBufHash, (void *) &bufHdr->tag,
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 237,243 ****
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
--- 247,254 ----
  			elog(ERROR, "local buffer hash table corrupted");
  		/* mark buffer invalid just in case hash insert fails */
  		CLEAR_BUFFERTAG(bufHdr->tag);
! 		state &= ~(BM_VALID | BM_TAG_VALID);
! 		pg_atomic_write_u32(&bufHdr->state, state);
  	}
  
  	hresult = (LocalBufferLookupEnt *)
*************** LocalBufferAlloc(SMgrRelation smgr, Fork
*** 250,258 ****
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	bufHdr->flags |= BM_TAG_VALID;
! 	bufHdr->usage_count = 1;
  
  	*foundPtr = FALSE;
  	return bufHdr;
--- 261,271 ----
  	 * it's all ours now.
  	 */
  	bufHdr->tag = newTag;
! 	state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
! 	state |= BM_TAG_VALID;
! 	state &= ~BUF_USAGECOUNT_MASK;
! 	state += BUF_USAGECOUNT_ONE;
! 	pg_atomic_write_u32(&bufHdr->state, state);
  
  	*foundPtr = FALSE;
  	return bufHdr;
*************** MarkLocalBufferDirty(Buffer buffer)
*** 267,272 ****
--- 280,286 ----
  {
  	int			bufid;
  	BufferDesc *bufHdr;
+ 	uint32		state;
  
  	Assert(BufferIsLocal(buffer));
  
*************** MarkLocalBufferDirty(Buffer buffer)
*** 280,289 ****
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	if (!(bufHdr->flags & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  
! 	bufHdr->flags |= BM_DIRTY;
  }
  
  /*
--- 294,303 ----
  
  	bufHdr = GetLocalBufferDescriptor(bufid);
  
! 	state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
  
! 	if (!(state & BM_DIRTY))
! 		pgBufferUsage.local_blks_dirtied++;
  }
  
  /*
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 307,314 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
--- 321,331 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
  			bufHdr->tag.forkNum == forkNum &&
  			bufHdr->tag.blockNum >= firstDelBlock)
*************** DropRelFileNodeLocalBuffers(RelFileNode 
*** 327,334 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 344,352 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 349,356 ****
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
  
! 		if ((bufHdr->flags & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
--- 367,377 ----
  	{
  		BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
  		LocalBufferLookupEnt *hresult;
+ 		uint32		state;
  
! 		state = pg_atomic_read_u32(&bufHdr->state);
! 
! 		if ((state & BM_TAG_VALID) &&
  			RelFileNodeEquals(bufHdr->tag.rnode, rnode))
  		{
  			if (LocalRefCount[i] != 0)
*************** DropRelFileNodeAllLocalBuffers(RelFileNo
*** 367,374 ****
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			bufHdr->flags = 0;
! 			bufHdr->usage_count = 0;
  		}
  	}
  }
--- 388,396 ----
  				elog(ERROR, "local buffer hash table corrupted");
  			/* Mark buffer invalid */
  			CLEAR_BUFFERTAG(bufHdr->tag);
! 			state &= ~BUF_FLAG_MASK;
! 			state &= ~BUF_USAGECOUNT_MASK;
! 			pg_atomic_write_u32(&bufHdr->state, state);
  		}
  	}
  }
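
A note on the localbuf.c hunks above: local buffers live in backend-private memory and are only ever touched by the owning backend, so the state word is read and written with plain pg_atomic_read_u32()/pg_atomic_write_u32() and the buffer header lock is never taken. The atomic type is used only so that local and shared buffers share the same BufferDesc layout and BUF_STATE_GET_* accessors. A minimal sketch of the pattern (hypothetical fragment, given a local BufferDesc *bufHdr):

	/* Backend-local buffer: an unlocked read-modify-write is safe here. */
	uint32		state = pg_atomic_read_u32(&bufHdr->state);

	state &= ~BM_DIRTY;			/* adjust flags as needed */
	pg_atomic_write_u32(&bufHdr->state, state);
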
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index cc0bf5e..5ebd4dc
*** a/src/backend/storage/lmgr/s_lock.c
--- b/src/backend/storage/lmgr/s_lock.c
*************** static int	spins_per_delay = DEFAULT_SPI
*** 30,146 ****
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(volatile slock_t *lock, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			lock, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 lock, file, line);
  #endif
  }
  
- 
  /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
   */
- int
- s_lock(volatile slock_t *lock, const char *file, int line)
- {
- 	/*
- 	 * We loop tightly for awhile, then delay using pg_usleep() and try again.
- 	 * Preferably, "awhile" should be a small multiple of the maximum time we
- 	 * expect a spinlock to be held.  100 iterations seems about right as an
- 	 * initial guess.  However, on a uniprocessor the loop is a waste of
- 	 * cycles, while in a multi-CPU scenario it's usually better to spin a bit
- 	 * longer than to call the kernel, so we try to adapt the spin loop count
- 	 * depending on whether we seem to be in a uniprocessor or multiprocessor.
- 	 *
- 	 * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
- 	 * be wrong; there are platforms where that can result in a "stuck
- 	 * spinlock" failure.  This has been seen particularly on Alphas; it seems
- 	 * that the first TAS after returning from kernel space will always fail
- 	 * on that hardware.
- 	 *
- 	 * Once we do decide to block, we use randomly increasing pg_usleep()
- 	 * delays. The first delay is 1 msec, then the delay randomly increases to
- 	 * about one second, after which we reset to 1 msec and start again.  The
- 	 * idea here is that in the presence of heavy contention we need to
- 	 * increase the delay, else the spinlock holder may never get to run and
- 	 * release the lock.  (Consider situation where spinlock holder has been
- 	 * nice'd down in priority by the scheduler --- it will not get scheduled
- 	 * until all would-be acquirers are sleeping, so if we always use a 1-msec
- 	 * sleep, there is a real possibility of starvation.)  But we can't just
- 	 * clamp the delay to an upper bound, else it would take a long time to
- 	 * make a reasonable number of tries.
- 	 *
- 	 * We time out and declare error after NUM_DELAYS delays (thus, exactly
- 	 * that many tries).  With the given settings, this will usually take 2 or
- 	 * so minutes.  It seems better to fix the total number of tries (and thus
- 	 * the probability of unintended failure) than to fix the total time
- 	 * spent.
- 	 */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! 	int			spins = 0;
! 	int			delays = 0;
! 	int			cur_delay = 0;
  
! 	while (TAS_SPIN(lock))
  	{
! 		/* CPU-specific delay each time through the loop */
! 		SPIN_DELAY();
! 
! 		/* Block the process every spins_per_delay tries */
! 		if (++spins >= spins_per_delay)
! 		{
! 			if (++delays > NUM_DELAYS)
! 				s_lock_stuck(lock, file, line);
  
! 			if (cur_delay == 0) /* first time to delay? */
! 				cur_delay = MIN_DELAY_USEC;
  
! 			pg_usleep(cur_delay);
  
  #if defined(S_LOCK_TEST)
! 			fprintf(stdout, "*");
! 			fflush(stdout);
  #endif
  
! 			/* increase delay by a random fraction between 1X and 2X */
! 			cur_delay += (int) (cur_delay *
! 					  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 			/* wrap back to minimum delay when max is exceeded */
! 			if (cur_delay > MAX_DELAY_USEC)
! 				cur_delay = MIN_DELAY_USEC;
  
! 			spins = 0;
! 		}
  	}
  
! 	/*
! 	 * If we were able to acquire the lock without delaying, it's a good
! 	 * indication we are in a multiprocessor.  If we had to delay, it's a sign
! 	 * (but not a sure thing) that we are in a uniprocessor. Hence, we
! 	 * decrement spins_per_delay slowly when we had to delay, and increase it
! 	 * rapidly when we didn't.  It's expected that spins_per_delay will
! 	 * converge to the minimum value on a uniprocessor and to the maximum
! 	 * value on a multiprocessor.
! 	 *
! 	 * Note: spins_per_delay is local within our current process. We want to
! 	 * average these observations across multiple backends, since it's
! 	 * relatively rare for this function to even get entered, and so a single
! 	 * backend might not live long enough to converge on a good value.  That
! 	 * is handled by the two routines below.
! 	 */
! 	if (cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
--- 30,139 ----
   * s_lock_stuck() - complain about a stuck spinlock
   */
  static void
! s_lock_stuck(Pointer p, const char *file, int line)
  {
  #if defined(S_LOCK_TEST)
  	fprintf(stderr,
  			"\nStuck spinlock (%p) detected at %s:%d.\n",
! 			p, file, line);
  	exit(1);
  #else
  	elog(PANIC, "stuck spinlock (%p) detected at %s:%d",
! 		 p, file, line);
  #endif
  }
  
  /*
!  * We loop tightly for awhile, then delay using pg_usleep() and try again.
!  * Preferably, "awhile" should be a small multiple of the maximum time we
!  * expect a spinlock to be held.  100 iterations seems about right as an
!  * initial guess.  However, on a uniprocessor the loop is a waste of
!  * cycles, while in a multi-CPU scenario it's usually better to spin a bit
!  * longer than to call the kernel, so we try to adapt the spin loop count
!  * depending on whether we seem to be in a uniprocessor or multiprocessor.
!  *
!  * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
!  * be wrong; there are platforms where that can result in a "stuck
!  * spinlock" failure.  This has been seen particularly on Alphas; it seems
!  * that the first TAS after returning from kernel space will always fail
!  * on that hardware.
!  *
!  * Once we do decide to block, we use randomly increasing pg_usleep()
!  * delays. The first delay is 1 msec, then the delay randomly increases to
!  * about one second, after which we reset to 1 msec and start again.  The
!  * idea here is that in the presence of heavy contention we need to
!  * increase the delay, else the spinlock holder may never get to run and
!  * release the lock.  (Consider situation where spinlock holder has been
!  * nice'd down in priority by the scheduler --- it will not get scheduled
!  * until all would-be acquirers are sleeping, so if we always use a 1-msec
!  * sleep, there is a real possibility of starvation.)  But we can't just
!  * clamp the delay to an upper bound, else it would take a long time to
!  * make a reasonable number of tries.
!  *
!  * We time out and declare error after NUM_DELAYS delays (thus, exactly
!  * that many tries).  With the given settings, this will usually take 2 or
!  * so minutes.  It seems better to fix the total number of tries (and thus
!  * the probability of unintended failure) than to fix the total time
!  * spent.
   */
  #define MIN_SPINS_PER_DELAY 10
  #define MAX_SPINS_PER_DELAY 1000
  #define NUM_DELAYS			1000
  #define MIN_DELAY_USEC		1000L
  #define MAX_DELAY_USEC		1000000L
  
! void
! perform_spin_delay(SpinDelayStatus *status)
! {
! 	/* CPU-specific delay each time through the loop */
! 	SPIN_DELAY();
  
! 	/* Block the process every spins_per_delay tries */
! 	if (++(status->spins) >= spins_per_delay)
  	{
! 		if (++(status->delays) > NUM_DELAYS)
! 			s_lock_stuck(status->ptr, status->file, status->line);
  
! 		if (status->cur_delay == 0) /* first time to delay? */
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		pg_usleep(status->cur_delay);
  
  #if defined(S_LOCK_TEST)
! 		fprintf(stdout, "*");
! 		fflush(stdout);
  #endif
  
! 		/* increase delay by a random fraction between 1X and 2X */
! 		status->cur_delay += (int) (status->cur_delay *
! 				  ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
! 		/* wrap back to minimum delay when max is exceeded */
! 		if (status->cur_delay > MAX_DELAY_USEC)
! 			status->cur_delay = MIN_DELAY_USEC;
  
! 		status->spins = 0;
  	}
+ }
  
! /*
!  * If we were able to acquire the lock without delaying, it's a good
!  * indication we are in a multiprocessor.  If we had to delay, it's a sign
!  * (but not a sure thing) that we are in a uniprocessor. Hence, we
!  * decrement spins_per_delay slowly when we had to delay, and increase it
!  * rapidly when we didn't.  It's expected that spins_per_delay will
!  * converge to the minimum value on a uniprocessor and to the maximum
!  * value on a multiprocessor.
!  *
!  * Note: spins_per_delay is local within our current process. We want to
!  * average these observations across multiple backends, since it's
!  * relatively rare for this function to even get entered, and so a single
!  * backend might not live long enough to converge on a good value.  That
!  * is handled by the two routines below.
!  */
! void
! finish_spin_delay(SpinDelayStatus *status)
! {
! 	if (status->cur_delay == 0)
  	{
  		/* we never had to delay */
  		if (spins_per_delay < MAX_SPINS_PER_DELAY)
*************** s_lock(volatile slock_t *lock, const cha
*** 151,157 ****
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! 	return delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
--- 144,167 ----
  		if (spins_per_delay > MIN_SPINS_PER_DELAY)
  			spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
  	}
! }
! 
! /*
!  * s_lock(lock) - platform-independent portion of waiting for a spinlock.
!  */
! int
! s_lock(volatile slock_t *lock, const char *file, int line)
! {
! 	SpinDelayStatus delayStatus = init_spin_delay((Pointer)lock);
! 
! 	while (TAS_SPIN(lock))
! 	{
! 		perform_spin_delay(&delayStatus);
! 	}
! 
! 	finish_spin_delay(&delayStatus);
! 
! 	return delayStatus.delays;
  }
  
  #ifdef USE_DEFAULT_S_UNLOCK
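
The point of splitting s_lock() into perform_spin_delay() and finish_spin_delay() is that the same back-off and spins_per_delay adaptation can be reused by code that spins on something other than an slock_t. As a hedged sketch (illustrative only; the patch's declaration of LockBufHdr() appears in buf_internals.h below), a buffer-header lock built on these helpers could look roughly like this:

	uint32
	LockBufHdr(BufferDesc *desc)
	{
		SpinDelayStatus delayStatus = init_spin_delay((Pointer) desc);
		uint32		old_state;

		for (;;)
		{
			/* Atomically set BM_LOCKED; if it was clear, we got the lock. */
			old_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
			if (!(old_state & BM_LOCKED))
				break;
			perform_spin_delay(&delayStatus);
		}
		finish_spin_delay(&delayStatus);

		/* Return the state as it is now stored, i.e. with BM_LOCKED set. */
		return old_state | BM_LOCKED;
	}
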
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index d04363b..e967b84
*** a/src/include/storage/buf_internals.h
--- b/src/include/storage/buf_internals.h
***************
*** 21,49 ****
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_DIRTY				(1 << 0)		/* data needs writing */
! #define BM_VALID				(1 << 1)		/* data is valid */
! #define BM_TAG_VALID			(1 << 2)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1 << 3)		/* read or write in progress */
! #define BM_IO_ERROR				(1 << 4)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
! #define BM_PERMANENT			(1 << 8)		/* permanent relation (not
  												 * unlogged) */
- 
- typedef bits16 BufFlags;
- 
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
--- 21,69 ----
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/smgr.h"
+ #include "port/atomics.h"
  #include "storage/spin.h"
  #include "utils/relcache.h"
  
  
  /*
+  * Buffer state is a single 32-bit variable that combines the following data:
+  *
+  * - 18 bits refcount
+  * - 4 bits usage count
+  * - 10 bits of flags
+  *
+  * Combining these values lets some operations be done in a single atomic
+  * operation rather than under the spinlock.  The parts are defined below.
+  */
+ #define BUF_REFCOUNT_ONE 1
+ #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+ #define BUF_USAGECOUNT_MASK 0x003C0000U
+ #define BUF_USAGECOUNT_ONE (1U << 18)
+ #define BUF_USAGECOUNT_SHIFT 18
+ #define BUF_FLAG_MASK 0xFFC00000U
+ 
+ /* Get refcount and usagecount from buffer state */
+ #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+ #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+ 
+ /*
   * Flags for buffer descriptors
   *
   * Note: TAG_VALID essentially means that there is a buffer hashtable
   * entry associated with the buffer's tag.
   */
! #define BM_LOCKED				(1U << 22)		/* buffer header is locked */
! #define BM_DIRTY				(1U << 23)		/* data needs writing */
! #define BM_VALID				(1U << 24)		/* data is valid */
! #define BM_TAG_VALID			(1U << 25)		/* tag is assigned */
! #define BM_IO_IN_PROGRESS		(1U << 26)		/* read or write in progress */
! #define BM_IO_ERROR				(1U << 27)		/* previous I/O failed */
! #define BM_JUST_DIRTIED			(1U << 28)		/* dirtied since write started */
! #define BM_PIN_COUNT_WAITER		(1U << 29)		/* have waiter for sole pin */
! #define BM_CHECKPOINT_NEEDED	(1U << 30)		/* must write for checkpoint */
! #define BM_PERMANENT			(1U << 31)		/* permanent relation (not
  												 * unlogged) */
  /*
   * The maximum allowed value of usage_count represents a tradeoff between
   * accuracy and speed of the clock-sweep buffer management algorithm.  A
*************** typedef struct buftag
*** 113,130 ****
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
!  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
!  * changes after initialization, so does not need locking.  freeNext is
!  * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
!  * take care of itself.  The buf_hdr_lock is *not* used to control access to
!  * the data in the buffer!
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the spinlock.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the spinlock; this is generally for situations where we don't expect
!  * the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
--- 133,161 ----
  /*
   *	BufferDesc -- shared descriptor/state data for a single shared buffer.
   *
!  * Note: The buffer header lock (BM_LOCKED flag) must be held to examine or
!  * change the tag, state or wait_backend_pid fields.  In general, the buffer
!  * header lock is a spinlock that is combined with the flags, refcount and
!  * usagecount into a single atomic variable.  This layout lets us do some
!  * operations in a single CAS, without actually acquiring and releasing the
!  * spinlock; for instance, increasing or decreasing the refcount.  The buf_id
!  * field never changes after initialization, so it does not need locking.
!  * freeNext is protected by the buffer_strategy_lock, not the buffer header
!  * lock.  The LWLock can take care of itself.  The buffer header lock is *not*
!  * used to control access to the data in the buffer!
!  *
!  * It is assumed that nobody changes the state field while the buffer header
!  * lock is held.  Thanks to this, the lock holder can apply a complex update
!  * of the state variable in a single write, simultaneously releasing the lock
!  * (clearing the BM_LOCKED flag).  Conversely, updating the state without
!  * holding the buffer header lock is restricted to CAS operations that ensure
!  * BM_LOCKED is not set; atomic increment/decrement, OR/AND etc. are not allowed.
   *
   * An exception is that if we have the buffer pinned, its tag can't change
!  * underneath us, so we can examine the tag without locking the buffer header.
   * Also, in places we do one-time reads of the flags without bothering to
!  * lock the buffer header; this is generally for situations where we don't
!  * expect the flag bit being tested to be changing.
   *
   * We can't physically remove items from a disk page if another backend has
   * the buffer pinned.  Hence, a backend may need to wait for all other pins
*************** typedef struct buftag
*** 142,154 ****
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
- 	BufFlags	flags;			/* see bit definitions above */
- 	uint8		usage_count;	/* usage counter for clock sweep code */
- 	slock_t		buf_hdr_lock;	/* protects a subset of fields, see above */
- 	unsigned	refcount;		/* # of backends holding pins on buffer */
- 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
- 
  	int			buf_id;			/* buffer's index number (from 0) */
  	int			freeNext;		/* link in freelist chain */
  
  	LWLock		content_lock;	/* to lock access to buffer contents */
--- 173,184 ----
  typedef struct BufferDesc
  {
  	BufferTag	tag;			/* ID of page contained in buffer */
  	int			buf_id;			/* buffer's index number (from 0) */
+ 
+ 	/* state of the tag, containing flags, refcount and usagecount */
+ 	pg_atomic_uint32 state;
+ 
+ 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
  	int			freeNext;		/* link in freelist chain */
  
  	LWLock		content_lock;	/* to lock access to buffer contents */
*************** extern PGDLLIMPORT LWLockMinimallyPadded
*** 202,212 ****
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Macros for acquiring/releasing a shared buffer header's spinlock.
!  * Do not apply these to local buffers!
   */
! #define LockBufHdr(bufHdr)		SpinLockAcquire(&(bufHdr)->buf_hdr_lock)
! #define UnlockBufHdr(bufHdr)	SpinLockRelease(&(bufHdr)->buf_hdr_lock)
  
  
  /*
--- 232,246 ----
  #define FREENEXT_NOT_IN_LIST	(-2)
  
  /*
!  * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
!  * not apply these to local buffers!
   */
! extern uint32 LockBufHdr(BufferDesc *desc);
! #define UnlockBufHdr(desc, s)	\
! 	do {	\
! 		pg_write_barrier(); \
! 		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
! 	} while (0)
  
  
  /*
*************** extern void IssuePendingWritebacks(Write
*** 267,273 ****
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
--- 301,308 ----
  extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
  
  /* freelist.c */
! extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
! 											  uint32 *state);
  extern void StrategyFreeBuffer(BufferDesc *buf);
  extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
  					 BufferDesc *buf);
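
To make the locking rule in the BufferDesc comment above concrete: lock-free updates of the state word have to be CAS loops that refuse to proceed while BM_LOCKED is set, because a plain atomic increment could race with a lock holder's combined update-and-release write. A hedged sketch of such a pin-style update (illustrative only, given a shared BufferDesc *buf; the patch's real pinning code lives in bufmgr.c):

	uint32		old_state = pg_atomic_read_u32(&buf->state);

	for (;;)
	{
		uint32		new_state;

		if (old_state & BM_LOCKED)
		{
			/* Re-read until the header lock holder has released the lock. */
			old_state = pg_atomic_read_u32(&buf->state);
			continue;
		}

		new_state = old_state + BUF_REFCOUNT_ONE;
		if (BUF_STATE_GET_USAGECOUNT(new_state) < BM_MAX_USAGE_COUNT)
			new_state += BUF_USAGECOUNT_ONE;

		/* Fails (and refreshes old_state) if anyone changed the word. */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_state, new_state))
			break;
	}
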
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 8b240cd..fd49abf
*** a/src/include/storage/s_lock.h
--- b/src/include/storage/s_lock.h
*************** extern int s_lock(volatile slock_t *lock
*** 991,994 ****
--- 991,1013 ----
  extern void set_spins_per_delay(int shared_spins_per_delay);
  extern int	update_spins_per_delay(int shared_spins_per_delay);
  
+ /*
+  * Support for spin delay, which can be useful in other places that
+  * implement spinlock-like procedures.
+  */
+ typedef struct
+ {
+ 	int			spins;
+ 	int			delays;
+ 	int			cur_delay;
+ 	Pointer		ptr;
+ 	const char *file;
+ 	int			line;
+ } SpinDelayStatus;
+ 
+ #define init_spin_delay(ptr) {0, 0, 0, (ptr), __FILE__, __LINE__}
+ 
+ extern void perform_spin_delay(SpinDelayStatus *status);
+ extern void finish_spin_delay(SpinDelayStatus *status);
+ 
  #endif	 /* S_LOCK_H */
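
Finally, a hedged sketch of how the exported SpinDelayStatus can be combined with the atomic buffer state outside s_lock() itself, e.g. a hypothetical helper that waits for the buffer header lock to be released before retrying an unlocked CAS update (name and placement are illustrative, not taken from the patch):

	static uint32
	WaitBufHdrUnlocked(BufferDesc *buf)
	{
		SpinDelayStatus delayStatus = init_spin_delay((Pointer) buf);
		uint32		state;

		state = pg_atomic_read_u32(&buf->state);
		while (state & BM_LOCKED)
		{
			perform_spin_delay(&delayStatus);
			state = pg_atomic_read_u32(&buf->state);
		}
		finish_spin_delay(&delayStatus);

		return state;
	}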
