Hi all

Here's a patch I wrote a while ago to detect and report when a
LWLockAcquire() results in a simple self-deadlock due to the caller already
holding the LWLock.

To avoid affecting hot-path performance, it only fires the check on the
first iteration through the retry loops in LWLockAcquire() and
LWLockWaitForVar(), and just before we sleep, once the fast-path has been
missed.

I wrote an earlier version of this when I was chasing down some hairy
issues with background workers deadlocking on some exit paths because
ereport(ERROR) or elog(ERROR) calls fired when a LWLock was held would
cause a before_shmem_exit or on_shmem_exit cleanup function to deadlock
when it tried to acquire the same lock.

But it's an easy enough mistake to make and a seriously annoying one to
track down, so I figured I'd post it for consideration. Maybe someone else
will get some use out of it even if nobody likes the idea of merging it.

As written the check runs only for --enable-cassert builds or when
LOCK_DEBUG is defined.
From 0ec0beb294f4c5ed35dbb35260f53b069563638f Mon Sep 17 00:00:00 2001
From: Craig Ringer <craig.rin...@2ndquadrant.com>
Date: Thu, 19 Nov 2020 14:24:55 +0800
Subject: [PATCH] LWLock self-deadlock detection

On cassert builds, if we miss the LWLock acquire fast-path and have to
wait for a LWLock, check to make sure that the acquiring backend
doesn't already hold the lock.

This detects conditions that would otherwise cause a backend to
deadlock indefinitely with interrupts disabled. A backend in this
state won't respond to SIGTERM or SIGQUIT and will prevent postgres
from finishing shutdown.

While a LWLock self-deadlock is always the result of a programming
mistake, it's not always easy to forsee all possible locking states
especially when dealing with exception handling and with exit
callbacks.
---
 src/backend/storage/lmgr/lwlock.c | 87 +++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 2fa90cc095..a2d6dec557 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -313,6 +313,85 @@ LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
 #define LOG_LWDEBUG(a,b,c) ((void)0)
 #endif							/* LOCK_DEBUG */
 
+#if defined(USE_ASSERT_CHECKING) || defined(LOCK_DEBUG)
+#ifndef LWLOCK_SELF_DEADLOCK_DETECTION
+#define LWLOCK_SELF_DEADLOCK_DETECTION
+#endif
+#endif
+
+#ifdef LWLOCK_SELF_DEADLOCK_DETECTION
+static void
+LWLockReportSelfDeadlock(LWLock *lock, LWLockMode mode, int held_lockno)
+{
+		StringInfoData locklist;
+		int i;
+
+		LOG_LWDEBUG("LWLockReportSelfDeadlock", lock, "acquire self-deadlock detected");
+
+		initStringInfo(&locklist);
+		for (i=0; i < num_held_lwlocks; i++) {
+			appendStringInfo(&locklist, "%s (%p) excl=%d",
+							T_NAME(held_lwlocks[i].lock),
+							held_lwlocks[i].lock,
+							held_lwlocks[i].mode == LW_EXCLUSIVE);
+			if (i > 0)
+				appendStringInfoString(&locklist, ", ");
+		}
+
+
+		/*
+		 * It might be safe to bail out here on a non-cassert build but
+		 * more care is needed before anything like that is enabled.
+		 * LWLockAcquire() doesn't know if it's safe to `elog(FATAL)`
+		 * out from the current callsite. So we PANIC. This introduces some
+		 * risk of buggy code causing a server crash/restart cycle where
+		 * it would've previously just deadlocked a single backend, but
+		 * that's part of why we only enable this for assert builds.
+		 */
+		ereport(PANIC,
+				(errmsg_internal("self-deadlock detected on acquire of lwlock %s (%p) for mode %s: lock already held by this backend in mode %s",
+								 T_NAME(lock), lock,
+								 mode == LW_EXCLUSIVE
+										? "LW_EXCLUSIVE" : "LW_SHARED",
+								 held_lwlocks[held_lockno].mode == LW_EXCLUSIVE
+										? "LW_EXCLUSIVE" : "LW_SHARED"),
+				 errdetail("backend already holds %d lwlocks: %s",
+						   num_held_lwlocks, locklist.data)));
+}
+
+/*
+ * A bug in code that uses this particular LWLock could cause
+ * the lock-holder could be ourselves, in which case we'll
+ * self-deadlock forever as an unkillable process.
+ *
+ * Call this in any LWLock acquire retry loop once the fast-path
+ * acquire has failed.
+ *
+ * held_lockno should be initialized to 0 outside the wait loop
+ * so that this check only runs on the first iteration of any
+ * wait/retry loop.
+ */
+inline static void
+LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno)
+{
+	for ( ; *held_lockno < num_held_lwlocks; (*held_lockno)++) {
+		if (unlikely(held_lwlocks[*held_lockno].lock == lock)) {
+			LWLockReportSelfDeadlock(lock, mode, *held_lockno);
+		}
+	}
+}
+#else
+/*
+ * Making LWLockCheckSelfDeadlock an empty static inline instead of a macro
+ * silences compiler whinging about held_lockno being unused while optimising
+ * away without fuss.
+ */
+inline static void
+LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno)
+{
+}
+#endif
+
 #ifdef LWLOCK_STATS
 
 static void init_lwlock_stats(void);
@@ -1210,6 +1289,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 	PGPROC	   *proc = MyProc;
 	bool		result = true;
 	int			extraWaits = 0;
+	int			held_lockno = 0; /* for self-deadlock detection */
 #ifdef LWLOCK_STATS
 	lwlock_stats *lwstats;
 
@@ -1322,6 +1402,9 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		lwstats->block_count++;
 #endif
 
+		/* Detect if we're trying to acquire our own lock */
+		LWLockCheckSelfDeadlock(lock, mode, &held_lockno);
+
 		LWLockReportWaitStart(lock);
 		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
 
@@ -1621,6 +1704,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 	PGPROC	   *proc = MyProc;
 	int			extraWaits = 0;
 	bool		result = false;
+	int			held_lockno = 0; /* for self-deadlock detection */
 #ifdef LWLOCK_STATS
 	lwlock_stats *lwstats;
 
@@ -1699,6 +1783,9 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 		lwstats->block_count++;
 #endif
 
+		/* Detect if we're trying to wait on our own lock */
+		LWLockCheckSelfDeadlock(lock, LW_EXCLUSIVE, &held_lockno);
+
 		LWLockReportWaitStart(lock);
 		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
 
-- 
2.26.2

Reply via email to