From: Vitaly Prosyak <[email protected]>
Add lockdep annotations to teach lockdep the correct lock hierarchy
and catch ordering violations during development. This follows the
pattern established by dma-resv in drivers/dma-buf/dma-resv.c.
Lock ordering hierarchy (outermost to innermost):
1. userq_sch_mutex - Global userq scheduler (enforce_isolation)
2. userq_mutex - Per-context userq (held across queue create/destroy)
3. notifier_lock - MMU notifier synchronization
4. vram_lock - VRAM memory allocator
5. reset_domain->sem - GPU reset synchronization
6. reset_lock - Reset control mutex
7. srbm_mutex - SRBM register access
8. grbm_idx_mutex - GRBM index register access
9. mmio_idx_lock - MMIO index access (spinlock)
The implementation provides:
- Lock ordering training at module init (amdgpu_lockdep_init)
- Lock class association for real driver locks (amdgpu_lockdep_set_class)
Dummy locks are associated with the same class keys as real driver locks
via lockdep_set_class(), ensuring lockdep connects the training ordering
with actual runtime locks.
Testing:
Build the kernel with CONFIG_PROVE_LOCKING=y (enables CONFIG_LOCKDEP):
scripts/config --enable PROVE_LOCKING
scripts/config --enable DEBUG_LOCKDEP
make -j$(nproc)
On boot, dmesg should show:
AMDGPU: Lockdep annotations initialized (9 lock levels)
The companion IGT test (tests/amdgpu/amd_lockdep) exercises lock-heavy
GPU code paths concurrently to trigger lockdep warnings on violations:
sudo ./build/tests/amdgpu/amd_lockdep
sudo dmesg | grep -A 50 "circular locking dependency"
IGT subtests:
concurrent-reset-and-submit - reset_sem vs submission locks
concurrent-mmap-and-evict - mmap_lock vs vram_lock
concurrent-userptr-and-reset - notifier_lock vs reset_sem
stress-all-paths - all of the above simultaneously
A clean dmesg (no "circular locking dependency" or "possible recursive
locking detected" messages) confirms no lock ordering violations.
For CI integration, the test should be run on kernels compiled with
CONFIG_LOCKDEP=y; dmesg is scanned post-run for lockdep splats.
v2: (Christian)
- Move notifier_lock and vram_lock before reset locks in hierarchy.
HMM invalidation holds notifier_lock and can wait for GPU reset
completion, so notifier_lock must be outer to reset_domain->sem.
- Associate dummy locks with lock class keys via lockdep_set_class()
so lockdep connects training with real driver locks.
- Update commit message to list all 9 lock levels.
Requires CONFIG_PROVE_LOCKING=y to activate.
Cc: Christian Konig <[email protected]>
Cc: Alex Deucher <[email protected]>
Signed-off-by: Vitaly Prosyak <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/Makefile | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +
drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c | 195 ++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h | 39 ++++
6 files changed, 242 insertions(+), 1 deletion(-)
create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c
create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h
diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile
b/drivers/gpu/drm/amd/amdgpu/Makefile
index ee3574797bc2..ba80542ead9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -69,7 +69,7 @@ amdgpu-y += amdgpu_device.o amdgpu_reg_access.o
amdgpu_doorbell_mgr.o amdgpu_kms
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
- amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
+ amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o
amdgpu_lockdep.o \
amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o
amdgpu_dev_coredump.o \
amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o amdgpu_ip.o
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5d7bfa59424a..7b09410d6d8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -105,6 +105,7 @@
#include "amdgpu_mca.h"
#include "amdgpu_aca.h"
#include "amdgpu_ras.h"
+#include "amdgpu_lockdep.h"
#include "amdgpu_cper.h"
#include "amdgpu_xcp.h"
#include "amdgpu_seq64.h"
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5ccbe6c885cf..159882f5fdab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3739,6 +3739,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->pm.stable_pstate_ctx_lock);
mutex_init(&adev->benchmark_mutex);
mutex_init(&adev->gfx.reset_sem_mutex);
+
+ /* Associate locks with lockdep classes for ordering validation */
+ amdgpu_lockdep_set_class(adev);
/* Initialize the mutex for cleaner shader isolation between GFX and
compute processes */
mutex_init(&adev->enforce_isolation_mutex);
for (i = 0; i < MAX_XCP; ++i) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1781c0c3d010..bf4260269681 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -3158,6 +3158,9 @@ static int __init amdgpu_init(void)
{
int r;
+ /* Train lockdep on correct lock ordering */
+ amdgpu_lockdep_init();
+
r = amdgpu_sync_init();
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c
new file mode 100644
index 000000000000..d5d71fd7c70d
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * Lockdep annotation for AMDGPU lock ordering
+ *
+ * This module teaches lockdep the correct lock ordering to catch
+ * potential deadlocks at development time rather than runtime.
+ *
+ * Based on dma-resv lockdep approach from:
+ * drivers/dma-buf/dma-resv.c:dma_resv_lockdep()
+ */
+
+#include "amdgpu.h"
+#include "amdgpu_reset.h"
+
+#ifdef CONFIG_LOCKDEP
+
+/* Lock class keys for associating with real driver locks */
+static struct lock_class_key amdgpu_userq_sch_mutex_key;
+static struct lock_class_key amdgpu_userq_mutex_key;
+static struct lock_class_key amdgpu_notifier_lock_key;
+static struct lock_class_key amdgpu_vram_lock_key;
+static struct lock_class_key amdgpu_reset_sem_key;
+static struct lock_class_key amdgpu_reset_lock_key;
+static struct lock_class_key amdgpu_srbm_lock_key;
+static struct lock_class_key amdgpu_grbm_lock_key;
+static struct lock_class_key amdgpu_mmio_lock_key;
+
+/**
+ * amdgpu_lockdep_set_class - Associate lock class keys with real locks
+ * @adev: AMDGPU device
+ *
+ * Call during device init to associate lock classes with actual locks
+ * so lockdep can track them properly.
+ */
+void amdgpu_lockdep_set_class(struct amdgpu_device *adev)
+{
+ lockdep_set_class(&adev->gfx.userq_sch_mutex,
+ &amdgpu_userq_sch_mutex_key);
+ lockdep_set_class(&adev->notifier_lock, &amdgpu_notifier_lock_key);
+ lockdep_set_class(&adev->srbm_mutex, &amdgpu_srbm_lock_key);
+ lockdep_set_class(&adev->grbm_idx_mutex, &amdgpu_grbm_lock_key);
+ lockdep_set_class(&adev->mmio_idx_lock, &amdgpu_mmio_lock_key);
+
+ if (adev->reset_domain)
+ lockdep_set_class(&adev->reset_domain->sem,
+ &amdgpu_reset_sem_key);
+}
+
+/**
+ * amdgpu_lockdep_init - Teach lockdep the correct lock ordering
+ *
+ * Instantiates dummy objects and takes locks in the correct order to
+ * train lockdep. This helps catch lock ordering violations during
+ * development.
+ *
+ * Lock ordering hierarchy (outermost to innermost):
+ *
+ * 1. userq_sch_mutex - Global userq scheduler (enforce_isolation)
+ * 2. userq_mutex - Per-context userq (held across queue
create/destroy)
+ * 3. notifier_lock - MMU notifier lock
+ * 4. vram_lock - VRAM allocator lock
+ * 5. reset_domain->sem - GPU reset synchronization
+ * 6. reset_lock - Reset control lock
+ * 7. srbm_mutex - SRBM register access
+ * 8. grbm_idx_mutex - GRBM index access
+ * 9. mmio_idx_lock - MMIO index access (spinlock)
+ *
+ * Evidence:
+ * - userq_sch_mutex -> userq_mutex: amdgpu_gfx_kfd_sch_ctrl() calls
+ * amdgpu_userq_stop_sched_for_enforce_isolation() which takes userq_mutex
+ * - userq_mutex -> notifier_lock: userq paths may trigger MMU notifier
+ * invalidation which acquires notifier_lock
+ * - notifier_lock -> reset_domain->sem: HMM invalidation callback holds
+ * notifier_lock and can wait for GPU reset completion, so notifier_lock
+ * must be outer to reset_domain->sem
+ * - vram_lock -> reset_domain->sem: VRAM management paths may need to
+ * wait for ongoing reset to complete
+ *
+ * Note: mmap_lock ordering relative to GPU locks is already taught
+ * by dma-resv (drivers/dma-buf/dma-resv.c).
+ */
+int amdgpu_lockdep_init(void)
+{
+ struct amdgpu_reset_domain *reset_domain = NULL;
+ struct amdgpu_reset_control reset_ctl;
+ struct mutex userq_sch_mutex;
+ struct mutex userq_mutex;
+ struct mutex notifier_lock;
+ struct mutex vram_lock;
+ struct mutex srbm_mutex;
+ struct mutex grbm_idx_mutex;
+ spinlock_t mmio_idx_lock;
+ unsigned long flags;
+
+ /*
+ * Initialize dummy reset domain
+ */
+ reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE,
+ "lockdep_test");
+ if (!reset_domain)
+ return -ENOMEM;
+
+ /* Initialize dummy locks */
+ mutex_init(&userq_sch_mutex);
+ mutex_init(&userq_mutex);
+ mutex_init(¬ifier_lock);
+ mutex_init(&vram_lock);
+ mutex_init(&reset_ctl.reset_lock);
+ mutex_init(&srbm_mutex);
+ mutex_init(&grbm_idx_mutex);
+ spin_lock_init(&mmio_idx_lock);
+
+ /*
+ * Associate dummy locks with the same class keys used for real
+ * driver locks. This ensures lockdep connects the ordering learned
+ * here with the actual locks used at runtime.
+ */
+ lockdep_set_class(&userq_sch_mutex, &amdgpu_userq_sch_mutex_key);
+ lockdep_set_class(&userq_mutex, &amdgpu_userq_mutex_key);
+ lockdep_set_class(¬ifier_lock, &amdgpu_notifier_lock_key);
+ lockdep_set_class(&vram_lock, &amdgpu_vram_lock_key);
+ lockdep_set_class(&reset_domain->sem, &amdgpu_reset_sem_key);
+ lockdep_set_class(&reset_ctl.reset_lock, &amdgpu_reset_lock_key);
+ lockdep_set_class(&srbm_mutex, &amdgpu_srbm_lock_key);
+ lockdep_set_class(&grbm_idx_mutex, &amdgpu_grbm_lock_key);
+ lockdep_set_class(&mmio_idx_lock, &amdgpu_mmio_lock_key);
+
+ /*
+ * Take locks in the correct order to train lockdep.
+ * This establishes the dependency chain.
+ */
+
+ /* Level 1: Global userq scheduler mutex (outermost) */
+ mutex_lock(&userq_sch_mutex);
+
+ /* Level 2: Per-context userq mutex */
+ mutex_lock(&userq_mutex);
+
+ /* Level 3: MMU notifier lock */
+ mutex_lock(¬ifier_lock);
+
+ /* Level 4: VRAM allocator lock */
+ mutex_lock(&vram_lock);
+
+ /* Level 5: Reset domain semaphore */
+ down_read(&reset_domain->sem);
+
+ /* Level 6: Reset control lock */
+ mutex_lock(&reset_ctl.reset_lock);
+
+ /*
+ * Mark potential memory reclaim boundary.
+ * GPU operations might trigger memory allocation/reclaim.
+ */
+ fs_reclaim_acquire(GFP_KERNEL);
+
+ /* Level 7: SRBM register access */
+ mutex_lock(&srbm_mutex);
+
+ /* Level 8: GRBM index access */
+ mutex_lock(&grbm_idx_mutex);
+
+ /* Level 9: MMIO index access (innermost lock, spinlock) */
+ spin_lock_irqsave(&mmio_idx_lock, flags);
+
+ /*
+ * All locks acquired in order.
+ * Lockdep has now learned the valid dependency chain.
+ */
+
+ /* Release in reverse order */
+ spin_unlock_irqrestore(&mmio_idx_lock, flags);
+ mutex_unlock(&grbm_idx_mutex);
+ mutex_unlock(&srbm_mutex);
+
+ fs_reclaim_release(GFP_KERNEL);
+
+ mutex_unlock(&reset_ctl.reset_lock);
+ up_read(&reset_domain->sem);
+ mutex_unlock(&vram_lock);
+ mutex_unlock(¬ifier_lock);
+ mutex_unlock(&userq_mutex);
+ mutex_unlock(&userq_sch_mutex);
+
+ /* Cleanup */
+ amdgpu_reset_put_reset_domain(reset_domain);
+
+ pr_info("AMDGPU: Lockdep annotations initialized (9 lock levels)\n");
+
+ return 0;
+}
+
+#endif /* CONFIG_LOCKDEP */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h
new file mode 100644
index 000000000000..04adb58665bf
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * Lockdep annotation interface for AMDGPU
+ */
+
+#ifndef __AMDGPU_LOCKDEP_H__
+#define __AMDGPU_LOCKDEP_H__
+
+#include <linux/lockdep.h>
+
+struct amdgpu_device;
+
+#ifdef CONFIG_LOCKDEP
+
+/**
+ * amdgpu_lockdep_init - Train lockdep on correct lock ordering
+ *
+ * Call once during module init to establish the lock dependency chain.
+ */
+int amdgpu_lockdep_init(void);
+
+/**
+ * amdgpu_lockdep_set_class - Associate lock class keys with real locks
+ * @adev: AMDGPU device
+ *
+ * Call during device init to associate lock classes with actual locks.
+ */
+void amdgpu_lockdep_set_class(struct amdgpu_device *adev);
+
+#else /* !CONFIG_LOCKDEP */
+
+static inline int amdgpu_lockdep_init(void) { return 0; }
+static inline void amdgpu_lockdep_set_class(struct amdgpu_device *adev) {}
+
+#endif /* CONFIG_LOCKDEP */
+
+#endif /* __AMDGPU_LOCKDEP_H__ */
--
2.54.0