Hello hackers!

Continuing the theme: 
http://www.postgresql.org/message-id/3368228.mTSz6V0Jsq@dinodell

This time we have properly rewritten 'refcount' and 'usage_count' as atomics in 
PinBuffer and UnpinBuffer (but we keep the header spinlock for the buffer flags in UnpinBuffer).

At the same time, this does not affect the correctness of the buffer manager,
because these variables are already protected by an LWLock above them (on the 
buffer mapping hash table partition). If someone pins the buffer after the call 
to StrategyGetBuffer, we simply try again (in BufferAlloc). There is also one 
more check in the code before the old buffer is evicted, at which point the 
changes can still be rolled back. The other functions 
that inspect 'refcount' and 'usage_count' take exclusive locks. 
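
To make the idea concrete, here is the new PinBuffer fast path as a minimal
sketch (pin_buffer_sketch and have_strategy are illustrative names of ours,
not from the patch; the real code is in the diff below and uses the
pg_atomic_* API from src/include/port/atomics.h):

    static void
    pin_buffer_sketch(volatile BufferDesc *buf, bool have_strategy)
    {
        /* the pin itself is now a plain atomic increment, no header lock */
        pg_atomic_add_fetch_u32(&buf->refcount, 1);

        if (!have_strategy)
        {
            uint32  expect = pg_atomic_read_u32(&buf->usage_count);

            /*
             * Bump usage_count with a CAS loop so it never exceeds
             * BM_MAX_USAGE_COUNT; on failure 'expect' is reloaded with
             * the current value and we retry.
             */
            while (expect < BM_MAX_USAGE_COUNT &&
                   !pg_atomic_compare_exchange_u32(&buf->usage_count,
                                                   &expect, expect + 1))
                ;
        }
        else
        {
            /* strategy access: only ever raise a zero usage_count to 1 */
            uint32  expect = 0;

            pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, 1);
        }
    }

UnpinBuffer is the mirror image: an atomic decrement of 'refcount', with the
header spinlock taken only around the BM_PIN_COUNT_WAITER check.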

A stress test with 256 KB of shared memory also completed successfully.

Without the patch we get 417523 TPS, and with the patch 965821 TPS, on a big x86 server. 
All details here: https://gist.github.com/stalkerg/773a81b79a27b4d5d63f

Thank you. 
-- 
Yuriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 6622d22..50ca2a5 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -33,14 +33,14 @@ typedef struct
 	BlockNumber blocknum;
 	bool		isvalid;
 	bool		isdirty;
-	uint16		usagecount;
+	uint32		usagecount;
 
 	/*
 	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
 	 * being pinned by too many backends and each backend will only pin once
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
-	int32		pinning_backends;
+	uint32		pinning_backends;
 } BufferCachePagesRec;
 
 
@@ -160,8 +160,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
 			fctx->record[i].forknum = bufHdr->tag.forkNum;
 			fctx->record[i].blocknum = bufHdr->tag.blockNum;
-			fctx->record[i].usagecount = bufHdr->usage_count;
-			fctx->record[i].pinning_backends = bufHdr->refcount;
+			fctx->record[i].usagecount = pg_atomic_read_u32(&bufHdr->usage_count);
+			fctx->record[i].pinning_backends = pg_atomic_read_u32(&bufHdr->refcount);
 
 			if (bufHdr->flags & BM_DIRTY)
 				fctx->record[i].isdirty = true;
@@ -236,7 +236,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			values[7] = Int16GetDatum(fctx->record[i].usagecount);
 			nulls[7] = false;
 			/* unused for v1.0 callers, but the array is always long enough */
-			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
+			values[8] = UInt32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
 		}
 
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..e139a7c 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -96,8 +96,8 @@ InitBufferPool(void)
 
 			CLEAR_BUFFERTAG(buf->tag);
 			buf->flags = 0;
-			buf->usage_count = 0;
-			buf->refcount = 0;
+			pg_atomic_init_u32(&buf->usage_count, 0);
+			pg_atomic_init_u32(&buf->refcount, 0);
 			buf->wait_backend_pid = 0;
 
 			SpinLockInit(&buf->buf_hdr_lock);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8c0358e..afba360 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -962,7 +962,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * into the buffer.
 		 */
 		buf = GetBufferDescriptor(buf_id);
-
 		valid = PinBuffer(buf, strategy);
 
 		/* Can release the mapping lock as soon as we've pinned it */
@@ -1013,7 +1012,15 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 */
 		buf = StrategyGetBuffer(strategy);
 
-		Assert(buf->refcount == 0);
+		/*
+		 * We could skip this check, but then we would have to remove the
+		 * new buffer from the hash table.  It is simpler to just try again.
+		 */
+		if (pg_atomic_read_u32(&buf->refcount) != 0)
+		{
+			UnlockBufHdr(buf);
+			continue;
+		}
 
 		/* Must copy buffer flags while we still hold the spinlock */
 		oldFlags = buf->flags;
@@ -1211,7 +1218,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		 * over with a new victim buffer.
 		 */
 		oldFlags = buf->flags;
-		if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
+		if (pg_atomic_read_u32(&buf->refcount) == 1 && !(oldFlags & BM_DIRTY))
 			break;
 
 		UnlockBufHdr(buf);
@@ -1234,10 +1241,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	buf->tag = newTag;
 	buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
 	if (relpersistence == RELPERSISTENCE_PERMANENT)
 		buf->flags |= BM_TAG_VALID | BM_PERMANENT;
 	else
 		buf->flags |= BM_TAG_VALID;
-	buf->usage_count = 1;
+	pg_atomic_write_u32(&buf->usage_count, 1);
 
 	UnlockBufHdr(buf);
 
@@ -1329,7 +1336,7 @@ retry:
 	 * yet done StartBufferIO, WaitIO will fall through and we'll effectively
 	 * be busy-looping here.)
 	 */
-	if (buf->refcount != 0)
+	if (pg_atomic_read_u32(&buf->refcount) != 0)
 	{
 		UnlockBufHdr(buf);
 		LWLockRelease(oldPartitionLock);
@@ -1347,7 +1354,7 @@ retry:
 	oldFlags = buf->flags;
 	CLEAR_BUFFERTAG(buf->tag);
 	buf->flags = 0;
-	buf->usage_count = 0;
+	pg_atomic_write_u32(&buf->usage_count, 0);
 
 	UnlockBufHdr(buf);
 
@@ -1399,7 +1406,7 @@ MarkBufferDirty(Buffer buffer)
 
 	LockBufHdr(bufHdr);
 
-	Assert(bufHdr->refcount > 0);
+	Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
 
 	/*
 	 * If the buffer was not dirty already, do vacuum accounting.
@@ -1498,20 +1505,23 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
 		ReservePrivateRefCountEntry();
 		ref = NewPrivateRefCountEntry(b);
 
-		LockBufHdr(buf);
-		buf->refcount++;
+		pg_atomic_add_fetch_u32(&buf->refcount, 1);
+
 		if (strategy == NULL)
 		{
-			if (buf->usage_count < BM_MAX_USAGE_COUNT)
-				buf->usage_count++;
+			uint32 expect = pg_atomic_read_u32(&buf->usage_count);
+			while (expect < BM_MAX_USAGE_COUNT)
+			{
+				if (pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, expect+1))
+					break;
+			}
 		}
 		else
 		{
-			if (buf->usage_count == 0)
-				buf->usage_count = 1;
+			uint32 expect = 0;
+			pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, 1);
 		}
 		result = (buf->flags & BM_VALID) != 0;
-		UnlockBufHdr(buf);
 	}
 	else
 	{
@@ -1558,7 +1568,7 @@ PinBuffer_Locked(volatile BufferDesc *buf)
 	 */
 	Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
 
-	buf->refcount++;
+	pg_atomic_add_fetch_u32(&buf->refcount, 1);
 	UnlockBufHdr(buf);
 
 	b = BufferDescriptorGetBuffer(buf);
@@ -1598,15 +1608,14 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 		Assert(!LWLockHeldByMe(buf->content_lock));
 		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
 
-		LockBufHdr(buf);
-
 		/* Decrement the shared reference count */
-		Assert(buf->refcount > 0);
-		buf->refcount--;
+		Assert(pg_atomic_read_u32(&buf->refcount) > 0);
+		pg_atomic_sub_fetch_u32(&buf->refcount, 1);
 
+		LockBufHdr(buf);
 		/* Support LockBufferForCleanup() */
 		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
-			buf->refcount == 1)
+			pg_atomic_read_u32(&buf->refcount) == 1)
 		{
 			/* we just released the last pin other than the waiter's */
 			int			wait_backend_pid = buf->wait_backend_pid;
@@ -2095,7 +2104,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
 	 */
 	LockBufHdr(bufHdr);
 
-	if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+	if (pg_atomic_read_u32(&bufHdr->refcount) == 0 && pg_atomic_read_u32(&bufHdr->usage_count) == 0)
 		result |= BUF_REUSABLE;
 	else if (skip_recently_used)
 	{
@@ -2278,7 +2287,7 @@ PrintBufferLeakWarning(Buffer buffer)
 		 "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
 		 buffer, path,
 		 buf->tag.blockNum, buf->flags,
-		 buf->refcount, loccount);
+		 pg_atomic_read_u32(&buf->refcount), loccount);
 	pfree(path);
 }
 
@@ -2809,7 +2818,7 @@ PrintBufferDescs(void)
 			 i, buf->freeNext,
 		  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
 			 buf->tag.blockNum, buf->flags,
-			 buf->refcount, GetPrivateRefCount(b));
+			 pg_atomic_read_u32(&buf->refcount), GetPrivateRefCount(b));
 	}
 }
 #endif
@@ -2834,7 +2843,7 @@ PrintPinnedBufs(void)
 				 i, buf->freeNext,
 				 relpathperm(buf->tag.rnode, buf->tag.forkNum),
 				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, GetPrivateRefCount(b));
+				 pg_atomic_read_u32(&buf->refcount), GetPrivateRefCount(b));
 		}
 	}
 }
@@ -3149,7 +3158,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		}
 
 		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
+		Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
 		if (!(bufHdr->flags & BM_DIRTY))
 		{
 			dirtied = true;		/* Means "will be dirtied by this action" */
@@ -3307,8 +3316,8 @@ LockBufferForCleanup(Buffer buffer)
 		/* Try to acquire lock */
 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		LockBufHdr(bufHdr);
-		Assert(bufHdr->refcount > 0);
-		if (bufHdr->refcount == 1)
+		Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
+		if (pg_atomic_read_u32(&bufHdr->refcount) == 1)
 		{
 			/* Successfully acquired exclusive lock with pincount 1 */
 			UnlockBufHdr(bufHdr);
@@ -3417,8 +3426,8 @@ ConditionalLockBufferForCleanup(Buffer buffer)
 
 	bufHdr = GetBufferDescriptor(buffer - 1);
 	LockBufHdr(bufHdr);
-	Assert(bufHdr->refcount > 0);
-	if (bufHdr->refcount == 1)
+	Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
+	if (pg_atomic_read_u32(&bufHdr->refcount) == 1)
 	{
 		/* Successfully acquired exclusive lock with pincount 1 */
 		UnlockBufHdr(bufHdr);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index bc2c773..4461271 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -280,14 +280,13 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 			 * of 8.3, but we'd better check anyway.)
 			 */
 			LockBufHdr(buf);
-			if (buf->refcount == 0 && buf->usage_count == 0)
+			if (pg_atomic_read_u32(&buf->refcount) == 0 && pg_atomic_read_u32(&buf->usage_count) == 0)
 			{
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
 				return buf;
 			}
 			UnlockBufHdr(buf);
-
 		}
 	}
 
@@ -303,11 +302,11 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
 		LockBufHdr(buf);
-		if (buf->refcount == 0)
+		if (pg_atomic_read_u32(&buf->refcount) == 0)
 		{
-			if (buf->usage_count > 0)
+			if (pg_atomic_read_u32(&buf->usage_count) > 0)
 			{
-				buf->usage_count--;
+				pg_atomic_fetch_sub_u32(&buf->usage_count, 1);
 				trycounter = NBuffers;
 			}
 			else
@@ -617,7 +616,7 @@ GetBufferFromRing(BufferAccessStrategy strategy)
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
 	LockBufHdr(buf);
-	if (buf->refcount == 0 && buf->usage_count <= 1)
+	if (pg_atomic_read_u32(&buf->refcount) == 0 && pg_atomic_read_u32(&buf->usage_count) <= 1)
 	{
 		strategy->current_was_in_ring = true;
 		return buf;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 3144afe..e1932f5 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -131,8 +131,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 		/* this part is equivalent to PinBuffer for a shared buffer */
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
-				bufHdr->usage_count++;
+			if (pg_atomic_read_u32(&bufHdr->usage_count) < BM_MAX_USAGE_COUNT)
+				pg_atomic_add_fetch_u32(&bufHdr->usage_count, 1);
 		}
 		LocalRefCount[b]++;
 		ResourceOwnerRememberBuffer(CurrentResourceOwner,
@@ -169,9 +169,9 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 
 		if (LocalRefCount[b] == 0)
 		{
-			if (bufHdr->usage_count > 0)
+			if (pg_atomic_read_u32(&bufHdr->usage_count) > 0)
 			{
-				bufHdr->usage_count--;
+				pg_atomic_fetch_sub_u32(&bufHdr->usage_count, 1);
 				trycounter = NLocBuffer;
 			}
 			else
@@ -252,7 +252,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
 	bufHdr->tag = newTag;
 	bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
 	bufHdr->flags |= BM_TAG_VALID;
-	bufHdr->usage_count = 1;
+	pg_atomic_write_u32(&bufHdr->usage_count, 1);
 
 	*foundPtr = FALSE;
 	return bufHdr;
@@ -328,7 +328,7 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
 			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			pg_atomic_write_u32(&bufHdr->usage_count, 0);
 		}
 	}
 }
@@ -368,7 +368,7 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
 			/* Mark buffer invalid */
 			CLEAR_BUFFERTAG(bufHdr->tag);
 			bufHdr->flags = 0;
-			bufHdr->usage_count = 0;
+			pg_atomic_write_u32(&bufHdr->usage_count, 0);
 		}
 	}
 }
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..68cbbf4 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -137,9 +137,9 @@ typedef struct buftag
 typedef struct BufferDesc
 {
 	BufferTag	tag;			/* ID of page contained in buffer */
 	BufFlags	flags;			/* see bit definitions above */
-	uint16		usage_count;	/* usage counter for clock sweep code */
-	unsigned	refcount;		/* # of backends holding pins on buffer */
+	pg_atomic_uint32 usage_count;	/* usage counter for clock sweep code */
+	pg_atomic_uint32 refcount;		/* # of backends holding pins on buffer */
 	int			wait_backend_pid;		/* backend PID of pin-count waiter */
 
 	slock_t		buf_hdr_lock;	/* protects the above fields */