Hi all,

Here's a patch I wrote a while ago that detects and reports when an LWLockAcquire() results in a simple self-deadlock because the caller already holds the LWLock.
To avoid affecting hot-path performance, the check fires only once the fast-path has been missed, just before we sleep, and only on the first iteration through the retry loops in LWLockAcquire() and LWLockWaitForVar().

I wrote an earlier version of this while chasing down some hairy issues where background workers deadlocked on certain exit paths: an ereport(ERROR) or elog(ERROR) raised while an LWLock was held would cause a before_shmem_exit or on_shmem_exit cleanup function to deadlock when it tried to acquire the same lock. It's an easy enough mistake to make and a seriously annoying one to track down, so I figured I'd post it for consideration. Maybe someone else will get some use out of it even if nobody likes the idea of merging it.

As written, the check runs only for --enable-cassert builds or when LOCK_DEBUG is defined.
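To make that failure mode concrete, here's a rough sketch of the pattern that bit me. Everything here is illustrative rather than taken from the patch: the lock, the callback, and the assumption that the error escapes to an exit path that runs the shmem-exit callbacks while the backend still holds the lock.

    #include "postgres.h"

    #include "storage/ipc.h"
    #include "storage/lwlock.h"

    /* Hypothetical LWLock owned by an extension; illustration only. */
    static LWLock *MyExtLock = NULL;

    /* Registered at startup with before_shmem_exit(my_ext_cleanup, (Datum) 0). */
    static void
    my_ext_cleanup(int code, Datum arg)
    {
        /*
         * If the backend is exiting because an ERROR was raised while
         * MyExtLock was still held, this acquire blocks on ourselves:
         * a silent, unkillable self-deadlock with interrupts held off.
         */
        LWLockAcquire(MyExtLock, LW_EXCLUSIVE);
        /* ... tidy up shared state ... */
        LWLockRelease(MyExtLock);
    }

    static void
    my_ext_do_work(void)
    {
        LWLockAcquire(MyExtLock, LW_EXCLUSIVE);

        /*
         * If this error escapes to an exit path that runs the shmem-exit
         * callbacks before this backend's LWLocks have been released,
         * my_ext_cleanup() above never returns.
         */
        elog(ERROR, "something went wrong while holding MyExtLock");

        LWLockRelease(MyExtLock);
    }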
From 0ec0beb294f4c5ed35dbb35260f53b069563638f Mon Sep 17 00:00:00 2001
From: Craig Ringer <craig.rin...@2ndquadrant.com>
Date: Thu, 19 Nov 2020 14:24:55 +0800
Subject: [PATCH] LWLock self-deadlock detection

On cassert builds, if we miss the LWLock acquire fast-path and have to
wait for a LWLock, check to make sure that the acquiring backend doesn't
already hold the lock.

This detects conditions that would otherwise cause a backend to deadlock
indefinitely with interrupts disabled. A backend in this state won't
respond to SIGTERM or SIGQUIT and will prevent postgres from finishing
shutdown.

While a LWLock self-deadlock is always the result of a programming
mistake, it's not always easy to foresee all possible locking states,
especially when dealing with exception handling and with exit callbacks.
---
 src/backend/storage/lmgr/lwlock.c | 87 +++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 2fa90cc095..a2d6dec557 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -313,6 +313,85 @@ LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
 #define LOG_LWDEBUG(a,b,c) ((void)0)
 #endif /* LOCK_DEBUG */
 
+#if defined(USE_ASSERT_CHECKING) || defined(LOCK_DEBUG)
+#ifndef LWLOCK_SELF_DEADLOCK_DETECTION
+#define LWLOCK_SELF_DEADLOCK_DETECTION
+#endif
+#endif
+
+#ifdef LWLOCK_SELF_DEADLOCK_DETECTION
+static void
+LWLockReportSelfDeadlock(LWLock *lock, LWLockMode mode, int held_lockno)
+{
+	StringInfoData locklist;
+	int			i;
+
+	LOG_LWDEBUG("LWLockReportSelfDeadlock", lock, "acquire self-deadlock detected");
+
+	initStringInfo(&locklist);
+	for (i = 0; i < num_held_lwlocks; i++) {
+		if (i > 0)
+			appendStringInfoString(&locklist, ", ");
+		appendStringInfo(&locklist, "%s (%p) excl=%d",
+				T_NAME(held_lwlocks[i].lock),
+				held_lwlocks[i].lock,
+				held_lwlocks[i].mode == LW_EXCLUSIVE);
+	}
+
+
+	/*
+	 * It might be safe to bail out here on a non-cassert build, but
+	 * more care is needed before anything like that is enabled.
+	 * LWLockAcquire() doesn't know if it's safe to elog(FATAL)
+	 * out from the current callsite. So we PANIC. This introduces some
+	 * risk of buggy code causing a server crash/restart cycle where
+	 * it would've previously just deadlocked a single backend, but
+	 * that's part of why we only enable this for assert builds.
+	 */
+	ereport(PANIC,
+			(errmsg_internal("self-deadlock detected on acquire of lwlock %s (%p) for mode %s: lock already held by this backend in mode %s",
+							 T_NAME(lock), lock,
+							 mode == LW_EXCLUSIVE
+							 ? "LW_EXCLUSIVE" : "LW_SHARED",
+							 held_lwlocks[held_lockno].mode == LW_EXCLUSIVE
+							 ? "LW_EXCLUSIVE" : "LW_SHARED"),
+			 errdetail("backend already holds %d lwlocks: %s",
+					   num_held_lwlocks, locklist.data)));
+}
+
+/*
+ * A bug in code that uses this particular LWLock could cause
+ * the lock holder to be ourselves, in which case we'd
+ * self-deadlock forever as an unkillable process.
+ *
+ * Call this in any LWLock acquire retry loop once the fast-path
+ * acquire has failed.
+ *
+ * held_lockno should be initialized to 0 outside the wait loop
+ * so that this check only runs on the first iteration of any
+ * wait/retry loop.
+ */
+inline static void
+LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno)
+{
+	for ( ; *held_lockno < num_held_lwlocks; (*held_lockno)++) {
+		if (unlikely(held_lwlocks[*held_lockno].lock == lock)) {
+			LWLockReportSelfDeadlock(lock, mode, *held_lockno);
+		}
+	}
+}
+#else
+/*
+ * Making LWLockCheckSelfDeadlock an empty static inline instead of a macro
+ * silences compiler whinging about held_lockno being unused while optimising
+ * away without fuss.
+ */
+inline static void
+LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno)
+{
+}
+#endif
+
 #ifdef LWLOCK_STATS
 
 static void init_lwlock_stats(void);
@@ -1210,6 +1289,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 	PGPROC	   *proc = MyProc;
 	bool		result = true;
 	int			extraWaits = 0;
+	int			held_lockno = 0;	/* for self-deadlock detection */
 
 #ifdef LWLOCK_STATS
 	lwlock_stats *lwstats;
@@ -1322,6 +1402,9 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		lwstats->block_count++;
 #endif
 
+		/* Detect if we're trying to acquire our own lock */
+		LWLockCheckSelfDeadlock(lock, mode, &held_lockno);
+
 		LWLockReportWaitStart(lock);
 		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
 
@@ -1621,6 +1704,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 	PGPROC	   *proc = MyProc;
 	int			extraWaits = 0;
 	bool		result = false;
+	int			held_lockno = 0;	/* for self-deadlock detection */
 
 #ifdef LWLOCK_STATS
 	lwlock_stats *lwstats;
@@ -1699,6 +1783,9 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 		lwstats->block_count++;
 #endif
 
+		/* Detect if we're trying to wait on our own lock */
+		LWLockCheckSelfDeadlock(lock, LW_EXCLUSIVE, &held_lockno);
+
 		LWLockReportWaitStart(lock);
 		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
 
-- 
2.26.2
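PS: for anyone who wants to see the check fire without constructing the full exit-callback scenario, the simplest trigger I know of is a double exclusive acquire from a single backend. Again, this is an illustrative fragment rather than a test from the patch; MyDemoLock stands in for any valid LWLock pointer:

    /*
     * Illustrative fragment, not part of the patch. On a patched
     * --enable-cassert build the second acquire misses the fast path
     * (the lock is already held, by us) and PANICs with the held-lock
     * list via LWLockReportSelfDeadlock() instead of sleeping forever
     * as an unkillable process.
     */
    LWLockAcquire(MyDemoLock, LW_EXCLUSIVE);
    LWLockAcquire(MyDemoLock, LW_EXCLUSIVE);	/* never returns unpatched */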