Hello hackers!
Continuing the theme of this earlier thread:
http://www.postgresql.org/message-id/3368228.mTSz6V0Jsq@dinodell
This time we have rewritten 'refcount' and 'usage_count' as atomics in
PinBuffer and UnpinBuffer (but kept the header lock for the buffer flags in
UnpinBuffer). This does not affect the correctness of the buffer manager,
because those variables are already protected by an LWLock above them (on
the hash table partition). If someone pins the buffer after the call to
StrategyGetBuffer, we simply try again (in BufferAlloc). The code also
performs one more check before deleting the old buffer, at which point the
changes can be rolled back. The other functions that inspect 'refcount' and
'usage_count' take exclusive locks.
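To make the approach concrete, here is a minimal sketch of the new pin path
(only an illustration, not the patch itself; it assumes the pg_atomic_*
primitives from src/include/port/atomics.h):

    /* Bump the shared pin count without taking the buffer header lock. */
    pg_atomic_add_fetch_u32(&buf->refcount, 1);

    /*
     * Bump usage_count with a CAS loop so it never exceeds
     * BM_MAX_USAGE_COUNT.  A failed CAS refreshes 'expect' with the
     * current value, so we retry until we either win or hit the cap.
     */
    uint32 expect = pg_atomic_read_u32(&buf->usage_count);
    while (expect < BM_MAX_USAGE_COUNT)
    {
        if (pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, expect + 1))
            break;
    }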
A stress test with 256 KB of shared memory also completed successfully.
On a big x86 server we get 417523 TPS without the patch and 965821 TPS with
it.
All details here: https://gist.github.com/stalkerg/773a81b79a27b4d5d63f
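For reference, TPS figures like these are typically collected with a
read-only pgbench run along the following lines (the exact configuration is
in the gist above; the command here is only an illustrative sketch):

    pgbench -i -s 100 bench                            # populate test tables
    pgbench -S -M prepared -c 128 -j 128 -T 300 bench  # select-only stress run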
Thank you.
--
YUriy Zhuravlev
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 6622d22..50ca2a5 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -33,14 +33,14 @@ typedef struct
BlockNumber blocknum;
bool isvalid;
bool isdirty;
- uint16 usagecount;
+ uint32 usagecount;
/*
* An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
* being pinned by too many backends and each backend will only pin once
* because of bufmgr.c's PrivateRefCount infrastructure.
*/
- int32 pinning_backends;
+ uint32 pinning_backends;
} BufferCachePagesRec;
@@ -160,8 +160,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
fctx->record[i].forknum = bufHdr->tag.forkNum;
fctx->record[i].blocknum = bufHdr->tag.blockNum;
- fctx->record[i].usagecount = bufHdr->usage_count;
- fctx->record[i].pinning_backends = bufHdr->refcount;
+ fctx->record[i].usagecount = pg_atomic_read_u32(&bufHdr->usage_count);
+ fctx->record[i].pinning_backends = pg_atomic_read_u32(&bufHdr->refcount);
if (bufHdr->flags & BM_DIRTY)
fctx->record[i].isdirty = true;
@@ -236,7 +236,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
values[7] = Int16GetDatum(fctx->record[i].usagecount);
nulls[7] = false;
/* unused for v1.0 callers, but the array is always long enough */
- values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
+ values[8] = UInt32GetDatum(fctx->record[i].pinning_backends);
nulls[8] = false;
}
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..e139a7c 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -96,8 +96,8 @@ InitBufferPool(void)
CLEAR_BUFFERTAG(buf->tag);
buf->flags = 0;
- buf->usage_count = 0;
- buf->refcount = 0;
+ pg_atomic_init_u32(&buf->usage_count, 0);
+ pg_atomic_init_u32(&buf->refcount, 0);
buf->wait_backend_pid = 0;
SpinLockInit(&buf->buf_hdr_lock);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8c0358e..afba360 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -962,7 +962,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* into the buffer.
*/
buf = GetBufferDescriptor(buf_id);
-
valid = PinBuffer(buf, strategy);
/* Can release the mapping lock as soon as we've pinned it */
@@ -1013,7 +1012,15 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
buf = StrategyGetBuffer(strategy);
- Assert(buf->refcount == 0);
+ /*
+ * Someone pinned the buffer concurrently.  We could proceed, but then we
+ * would have to remove the new entry from the hash table; better to retry.
+ */
+ if (pg_atomic_read_u32(&buf->refcount) != 0)
+ {
+ UnlockBufHdr(buf);
+ continue;
+ }
/* Must copy buffer flags while we still hold the spinlock */
oldFlags = buf->flags;
@@ -1211,7 +1218,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* over with a new victim buffer.
*/
oldFlags = buf->flags;
- if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
+ if (pg_atomic_read_u32(&buf->refcount) == 1 && !(oldFlags & BM_DIRTY))
break;
UnlockBufHdr(buf);
@@ -1234,10 +1241,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
buf->tag = newTag;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
else
buf->flags |= BM_TAG_VALID;
- buf->usage_count = 1;
+ pg_atomic_write_u32(&buf->usage_count, 1);
UnlockBufHdr(buf);
@@ -1329,7 +1336,7 @@ retry:
* yet done StartBufferIO, WaitIO will fall through and we'll effectively
* be busy-looping here.)
*/
- if (buf->refcount != 0)
+ if (pg_atomic_read_u32(&buf->refcount) != 0)
{
UnlockBufHdr(buf);
LWLockRelease(oldPartitionLock);
@@ -1347,7 +1354,7 @@ retry:
oldFlags = buf->flags;
CLEAR_BUFFERTAG(buf->tag);
buf->flags = 0;
- buf->usage_count = 0;
+ pg_atomic_write_u32(&buf->usage_count, 0);
UnlockBufHdr(buf);
@@ -1399,7 +1406,7 @@ MarkBufferDirty(Buffer buffer)
LockBufHdr(bufHdr);
- Assert(bufHdr->refcount > 0);
+ Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
/*
* If the buffer was not dirty already, do vacuum accounting.
@@ -1498,20 +1505,23 @@ PinBuffer(volatile BufferDesc *buf, BufferAccessStrategy strategy)
ReservePrivateRefCountEntry();
ref = NewPrivateRefCountEntry(b);
- LockBufHdr(buf);
- buf->refcount++;
+ pg_atomic_add_fetch_u32(&buf->refcount, 1);
+
if (strategy == NULL)
{
- if (buf->usage_count < BM_MAX_USAGE_COUNT)
- buf->usage_count++;
+ uint32 expect = pg_atomic_read_u32(&buf->usage_count);
+ while (expect < BM_MAX_USAGE_COUNT)
+ {
+ if (pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, expect+1))
+ break;
+ }
}
else
{
- if (buf->usage_count == 0)
- buf->usage_count = 1;
+ uint32 expect = 0;
+ pg_atomic_compare_exchange_u32(&buf->usage_count, &expect, 1);
}
result = (buf->flags & BM_VALID) != 0;
- UnlockBufHdr(buf);
}
else
{
@@ -1558,7 +1568,7 @@ PinBuffer_Locked(volatile BufferDesc *buf)
*/
Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
- buf->refcount++;
+ pg_atomic_add_fetch_u32(&buf->refcount, 1);
UnlockBufHdr(buf);
b = BufferDescriptorGetBuffer(buf);
@@ -1598,15 +1608,14 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
Assert(!LWLockHeldByMe(buf->content_lock));
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
- LockBufHdr(buf);
-
/* Decrement the shared reference count */
- Assert(buf->refcount > 0);
- buf->refcount--;
+ Assert(pg_atomic_read_u32(&buf->refcount) > 0);
+ pg_atomic_sub_fetch_u32(&buf->refcount, 1);
+ LockBufHdr(buf);
/* Support LockBufferForCleanup() */
if ((buf->flags & BM_PIN_COUNT_WAITER) &&
- buf->refcount == 1)
+ pg_atomic_read_u32(&buf->refcount) == 1)
{
/* we just released the last pin other than the waiter's */
int wait_backend_pid = buf->wait_backend_pid;
@@ -2095,7 +2104,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
*/
LockBufHdr(bufHdr);
- if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
+ if (pg_atomic_read_u32(&bufHdr->refcount) == 0 && pg_atomic_read_u32(&bufHdr->usage_count) == 0)
result |= BUF_REUSABLE;
else if (skip_recently_used)
{
@@ -2278,7 +2287,7 @@ PrintBufferLeakWarning(Buffer buffer)
"(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
buffer, path,
buf->tag.blockNum, buf->flags,
- buf->refcount, loccount);
+ pg_atomic_read_u32(&buf->refcount), loccount);
pfree(path);
}
@@ -2809,7 +2818,7 @@ PrintBufferDescs(void)
i, buf->freeNext,
relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
buf->tag.blockNum, buf->flags,
- buf->refcount, GetPrivateRefCount(b));
+ pg_atomic_read_u32(&buf->refcount), GetPrivateRefCount(b));
}
}
#endif
@@ -2834,7 +2843,7 @@ PrintPinnedBufs(void)
i, buf->freeNext,
relpathperm(buf->tag.rnode, buf->tag.forkNum),
buf->tag.blockNum, buf->flags,
- buf->refcount, GetPrivateRefCount(b));
+ pg_atomic_read_u32(&buf->refcount), GetPrivateRefCount(b));
}
}
}
@@ -3149,7 +3158,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
}
LockBufHdr(bufHdr);
- Assert(bufHdr->refcount > 0);
+ Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
if (!(bufHdr->flags & BM_DIRTY))
{
dirtied = true; /* Means "will be dirtied by this action" */
@@ -3307,8 +3316,8 @@ LockBufferForCleanup(Buffer buffer)
/* Try to acquire lock */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBufHdr(bufHdr);
- Assert(bufHdr->refcount > 0);
- if (bufHdr->refcount == 1)
+ Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
+ if (pg_atomic_read_u32(&bufHdr->refcount) == 1)
{
/* Successfully acquired exclusive lock with pincount 1 */
UnlockBufHdr(bufHdr);
@@ -3417,8 +3426,8 @@ ConditionalLockBufferForCleanup(Buffer buffer)
bufHdr = GetBufferDescriptor(buffer - 1);
LockBufHdr(bufHdr);
- Assert(bufHdr->refcount > 0);
- if (bufHdr->refcount == 1)
+ Assert(pg_atomic_read_u32(&bufHdr->refcount) > 0);
+ if (pg_atomic_read_u32(&bufHdr->refcount) == 1)
{
/* Successfully acquired exclusive lock with pincount 1 */
UnlockBufHdr(bufHdr);
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index bc2c773..4461271 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -280,14 +280,13 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
* of 8.3, but we'd better check anyway.)
*/
LockBufHdr(buf);
- if (buf->refcount == 0 && buf->usage_count == 0)
+ if (pg_atomic_read_u32(&buf->refcount) == 0 && pg_atomic_read_u32(&buf->usage_count) == 0)
{
if (strategy != NULL)
AddBufferToRing(strategy, buf);
return buf;
}
UnlockBufHdr(buf);
-
}
}
@@ -303,11 +302,11 @@ StrategyGetBuffer(BufferAccessStrategy strategy)
* it; decrement the usage_count (unless pinned) and keep scanning.
*/
LockBufHdr(buf);
- if (buf->refcount == 0)
+ if (pg_atomic_read_u32(&buf->refcount) == 0)
{
- if (buf->usage_count > 0)
+ if (pg_atomic_read_u32(&buf->usage_count) > 0)
{
- buf->usage_count--;
+ pg_atomic_fetch_sub_u32(&buf->usage_count, 1);
trycounter = NBuffers;
}
else
@@ -617,7 +616,7 @@ GetBufferFromRing(BufferAccessStrategy strategy)
*/
buf = GetBufferDescriptor(bufnum - 1);
LockBufHdr(buf);
- if (buf->refcount == 0 && buf->usage_count <= 1)
+ if (pg_atomic_read_u32(&buf->refcount) == 0 && pg_atomic_read_u32(&buf->usage_count) <= 1)
{
strategy->current_was_in_ring = true;
return buf;
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 3144afe..e1932f5 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -131,8 +131,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
/* this part is equivalent to PinBuffer for a shared buffer */
if (LocalRefCount[b] == 0)
{
- if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
- bufHdr->usage_count++;
+ if (pg_atomic_read_u32(&bufHdr->usage_count) < BM_MAX_USAGE_COUNT)
+ pg_atomic_add_fetch_u32(&bufHdr->usage_count, 1);
}
LocalRefCount[b]++;
ResourceOwnerRememberBuffer(CurrentResourceOwner,
@@ -169,9 +169,9 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
if (LocalRefCount[b] == 0)
{
- if (bufHdr->usage_count > 0)
+ if (pg_atomic_read_u32(&bufHdr->usage_count) > 0)
{
- bufHdr->usage_count--;
+ pg_atomic_fetch_sub_u32(&bufHdr->usage_count, 1);
trycounter = NLocBuffer;
}
else
@@ -252,7 +252,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
bufHdr->tag = newTag;
bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
bufHdr->flags |= BM_TAG_VALID;
- bufHdr->usage_count = 1;
+ pg_atomic_write_u32(&bufHdr->usage_count, 1);
*foundPtr = FALSE;
return bufHdr;
@@ -328,7 +328,7 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
/* Mark buffer invalid */
CLEAR_BUFFERTAG(bufHdr->tag);
bufHdr->flags = 0;
- bufHdr->usage_count = 0;
+ pg_atomic_write_u32(&bufHdr->usage_count, 0);
}
}
}
@@ -368,7 +368,7 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
/* Mark buffer invalid */
CLEAR_BUFFERTAG(bufHdr->tag);
bufHdr->flags = 0;
- bufHdr->usage_count = 0;
+ pg_atomic_write_u32(&bufHdr->usage_count, 0);
}
}
}
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..68cbbf4 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -137,9 +137,9 @@ typedef struct buftag
typedef struct BufferDesc
{
BufferTag tag; /* ID of page contained in buffer */
- BufFlags flags; /* see bit definitions above */
- uint16 usage_count; /* usage counter for clock sweep code */
- unsigned refcount; /* # of backends holding pins on buffer */
+ BufFlags flags; /* see bit definitions above */
+ pg_atomic_uint32 usage_count; /* usage counter for clock sweep code */
+ pg_atomic_uint32 refcount; /* # of backends holding pins on buffer */
int wait_backend_pid; /* backend PID of pin-count waiter */
slock_t buf_hdr_lock; /* protects the above fields */
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers