Hi, While finalizing some fixes in BRIN, I decided to stress-test the relevant part of the code to check if I missed something. Imagine a simple script that builds BRIN indexes on random data, does random changes and cross-checks the results with/without the index.
But instead of I almost immediately ran into a LWLock deadlock :-( I've managed to reproduce this on PG13+, but I believe it's there since the brinRevmapDesummarizeRange was introduced in PG10. I just haven't tried on pre-13 releases. The stress-test-2.sh script (attached .tgz) builds a table, fills it with random data and then runs a mix of updates and (de)summarization DDL of random fraction of the index. The lockup is usually triggered within a couple minutes, but might take longer (I guess it depends on parameters used to generate the random data, so it may take a couple runs to hit the right combination). The root cause is that brin_doupdate and brinRevmapDesummarizeRange end up locking buffers in different orders. Attached is also a .patch that adds a bunch of LOG messages for buffer locking in BRIN code (it's for PG13, but should work on newer releases too). Here's a fairly typical example of the interaction between brin_doupdate and brinRevmapDesummarizeRange: brin_doupdate (from UPDATE query): LOG: brin_doupdate: samepage 0 LOG: number of LWLocks held: 0 LOG: brin_getinsertbuffer: locking 898 lock 0x7f9a99a5af64 LOG: brin_getinsertbuffer: buffer locked LOG: brin_getinsertbuffer B: locking 899 lock 0x7f9a99a5afa4 LOG: brin_getinsertbuffer B: buffer locked LOG: number of LWLocks held: 2 LOG: lock 0 => 0x7f9a99a5af64 LOG: lock 1 => 0x7f9a99a5afa4 LOG: brin_doupdate: locking buffer for update LOG: brinLockRevmapPageForUpdate: locking 158 lock 0x7f9a99a4f664 brinRevmapDesummarizeRange (from brin_desummarize_range): LOG: starting brinRevmapDesummarizeRange LOG: number of LWLocks held: 0 LOG: brinLockRevmapPageForUpdate: locking 158 lock 0x7f9a99a4f664 LOG: brinLockRevmapPageForUpdate: buffer locked LOG: number of LWLocks held: 1 LOG: lock 0 => 0x7f9a99a4f664 LOG: brinRevmapDesummarizeRange: locking 898 lock 0x7f9a99a5af64 So, brin_doupdate starts with no LWLocks, and locks buffers 898, 899 (through getinsertbuffer). And then tries to lock 158. Meanwhile brinRevmapDesummarizeRange locks 158 first, and then tries to lock 898. So, a LWLock deadlock :-( I've now seen a bunch of these traces, with only minor differences. For example brinRevmapDesummarizeRange might gets stuck on the second buffer locked by getinsertbuffer (not the first one like here). I don't have a great idea how to fix this - I guess we need to ensure the buffers are locked in the same order, but that seems tricky. Obviously, people don't call brin_desummarize_range() very often, which likely explains the lack of reports. regards -- Tomas Vondra EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 0becfde113..65e4ed76b4 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -689,7 +689,9 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
meta = ReadBuffer(index, P_NEW);
Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
+ elog(LOG, "brinbuild: locking buffer %d lock %p", meta, BufferDescriptorGetContentLock2(meta));
LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brinbuild: buffer locked");
brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
BRIN_CURRENT_VERSION);
@@ -756,7 +758,9 @@ brinbuildempty(Relation index)
/* An empty BRIN index has a metapage only. */
metabuf =
ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
+ elog(LOG, "brinbuildempty: locking buffer %d lock %p", metabuf, BufferDescriptorGetContentLock2(metabuf));
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brinbuildempty: buffer locked");
/* Initialize and xlog metabuffer. */
START_CRIT_SECTION();
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index d5b19293a0..d4a118eb54 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -78,6 +78,10 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
return false; /* keep compiler quiet */
}
+ elog(LOG, "brin_doupdate: samepage %d", samepage);
+
+ DumpHeldLWLocks();
+
/* make sure the revmap is long enough to contain the entry we need */
brinRevmapExtend(revmap, heapBlk);
@@ -106,13 +110,18 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
}
else
{
+ elog(LOG, "brin_doupdate: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf));
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brin_doupdate: buffer locked");
newbuf = InvalidBuffer;
extended = false;
}
oldpage = BufferGetPage(oldbuf);
oldlp = PageGetItemId(oldpage, oldoff);
+
+ DumpHeldLWLocks();
+
/*
* Check that the old tuple wasn't updated concurrently: it might have
* moved someplace else entirely, and for that matter the whole page
@@ -125,6 +134,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
oldoff > PageGetMaxOffsetNumber(oldpage) ||
!ItemIdIsNormal(oldlp))
{
+ elog(LOG, "brin_doupdate A: unlock %d new %d", oldbuf, newbuf);
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
/*
@@ -151,6 +161,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
*/
if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
{
+ elog(LOG, "brin_doupdate B: unlock %d new %d", oldbuf, newbuf);
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
if (BufferIsValid(newbuf))
{
@@ -223,6 +234,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
* Not enough space, but caller said that there was. Tell them to
* start over.
*/
+ elog(LOG, "brin_doupdate C: unlock %d", oldbuf);
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
return false;
}
@@ -238,7 +250,9 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
OffsetNumber newoff;
Size freespace = 0;
+ elog(LOG, "brin_doupdate: locking buffer for update");
revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+ elog(LOG, "brin_doupdate: buffer locked");
START_CRIT_SECTION();
@@ -303,6 +317,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
END_CRIT_SECTION();
+ elog(LOG, "brin_doupdate D: unlock %d %d %d", revmapbuf, oldbuf, newbuf);
LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
UnlockReleaseBuffer(newbuf);
@@ -378,7 +393,9 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
* revmap over the page we held a pin on, so we cannot assume that
* it's still a regular page.
*/
+ elog(LOG, "brin_doinsert: locking buffer %d lock %p", *buffer, BufferDescriptorGetContentLock2(*buffer));
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brin_doinsert: buffer locked");
if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
{
UnlockReleaseBuffer(*buffer);
@@ -641,7 +658,9 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
LockRelationForExtension(idxrel, ShareLock);
UnlockRelationForExtension(idxrel, ShareLock);
+ elog(LOG, "brin_page_cleanup: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brin_page_cleanup: buffer locked");
if (PageIsNew(page))
{
brin_initialize_empty_new_buffer(idxrel, buf);
@@ -764,7 +783,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
*/
if (BufferIsValid(oldbuf) && oldblk < newblk)
{
+ elog(LOG, "brin_getinsertbuffer: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf));
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brin_getinsertbuffer: buffer locked");
if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
{
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
@@ -796,7 +817,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
}
}
+ elog(LOG, "brin_getinsertbuffer B: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brin_getinsertbuffer B: buffer locked");
if (extensionLockHeld)
UnlockRelationForExtension(irel, ExclusiveLock);
@@ -823,7 +846,9 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
*/
if (BufferIsValid(oldbuf) && oldblk > newblk)
{
+ elog(LOG, "brin_getinsertbuffer C: locking buffer %d lock %p", oldbuf, BufferDescriptorGetContentLock2(oldbuf));
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brin_getinsertbuffer: buffer locked");
Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
}
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
index 35746714a7..072d82514c 100644
--- a/src/backend/access/brin/brin_revmap.c
+++ b/src/backend/access/brin/brin_revmap.c
@@ -139,7 +139,9 @@ brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
Buffer rmBuf;
rmBuf = revmap_get_buffer(revmap, heapBlk);
+ elog(LOG, "brinLockRevmapPageForUpdate: locking buffer %d lock %p", rmBuf, BufferDescriptorGetContentLock2(rmBuf));
LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brinLockRevmapPageForUpdate: buffer locked");
return rmBuf;
}
@@ -341,6 +343,9 @@ brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
OffsetNumber regOffset;
ItemId lp;
+ elog(LOG, "starting brinRevmapDesummarizeRange");
+ DumpHeldLWLocks();
+
revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL);
revmapBlk = revmap_get_blkno(revmap, heapBlk);
@@ -368,8 +373,12 @@ brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
return true;
}
+ DumpHeldLWLocks();
+
regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr));
+ elog(LOG, "brinRevmapDesummarizeRange: locking buffer %d lock %p", regBuf, BufferDescriptorGetContentLock2(regBuf));
LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "brinRevmapDesummarizeRange: buffer locked");
regPg = BufferGetPage(regBuf);
/*
* We're only removing data, not reading it, so there's no need to
@@ -544,7 +553,9 @@ revmap_physical_extend(BrinRevmap *revmap)
* but note that we still need to grab the relation extension lock because
* another backend can extend the index with regular BRIN pages.
*/
+ elog(LOG, "revmap_physical_extend: locking buffer %d lock %p", revmap->rm_metaBuf, BufferDescriptorGetContentLock2(revmap->rm_metaBuf));
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "revmap_physical_extend: buffer locked");
metapage = BufferGetPage(revmap->rm_metaBuf);
metadata = (BrinMetaPageData *) PageGetContents(metapage);
@@ -564,7 +575,9 @@ revmap_physical_extend(BrinRevmap *revmap)
if (mapBlk < nblocks)
{
buf = ReadBuffer(irel, mapBlk);
+ elog(LOG, "revmap_physical_extend B: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "revmap_physical_extend B: buffer locked");
page = BufferGetPage(buf);
}
else
@@ -587,7 +600,9 @@ revmap_physical_extend(BrinRevmap *revmap)
ReleaseBuffer(buf);
return;
}
+ elog(LOG, "revmap_physical_extend C: locking buffer %d lock %p", buf, BufferDescriptorGetContentLock2(buf));
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ elog(LOG, "revmap_physical_extend C: buffer locked");
page = BufferGetPage(buf);
if (needLock)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 69a1401654..0a9231f649 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3740,6 +3740,16 @@ LockBuffer(Buffer buffer, int mode)
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
+void *
+BufferDescriptorGetContentLock2(Buffer buffer)
+{
+ BufferDesc *buf;
+
+ buf = GetBufferDescriptor(buffer - 1);
+
+ return BufferDescriptorGetContentLock(buf);
+}
+
/*
* Acquire the content_lock for the buffer, but only if we don't have to wait.
*
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 365d215f2b..8952226e1d 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -1984,3 +1984,11 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
}
return false;
}
+
+void
+DumpHeldLWLocks(void)
+{
+ elog(LOG, "number of LWLocks held: %d", num_held_lwlocks);
+ for (int i = 0; i < num_held_lwlocks; i++)
+ elog(LOG, "lock %d => %p", i, held_lwlocks[i].lock);
+}
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8fa26..d1325154ab 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -168,6 +168,8 @@ extern PGDLLIMPORT int32 *LocalRefCount;
*/
#define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer))
+extern void *BufferDescriptorGetContentLock2(Buffer desc);
+
/*
* prototypes for functions in bufmgr.c
*/
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index cdbfbed118..bd34994013 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -187,6 +187,8 @@ extern int LWLockNewTrancheId(void);
extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
extern void LWLockInitialize(LWLock *lock, int tranche_id);
+extern void DumpHeldLWLocks(void);
+
/*
* Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also,
* we reserve additional tranche IDs for builtin tranches not included in
stress-test.tgz
Description: application/compressed-tar
