On 5/26/26 19:16, [email protected] wrote:
> From: Vitaly Prosyak <[email protected]>
> 
> Add lockdep annotations to teach lockdep the correct lock hierarchy
> and catch ordering violations during development. This follows the
> pattern established by dma-resv in drivers/dma-buf/dma-resv.c.
> 
> Lock ordering hierarchy (outermost to innermost):
> 
> 1. userq_sch_mutex   - Global userq scheduler (enforce_isolation)
> 2. userq_mutex       - Per-context userq (held across queue create/destroy)
> 3. notifier_lock     - MMU notifier synchronization
> 4. vram_lock         - VRAM memory allocator
> 5. reset_domain->sem - GPU reset synchronization
> 6. reset_lock        - Reset control mutex
> 7. srbm_mutex        - SRBM register access
> 8. grbm_idx_mutex    - GRBM index register access
> 9. mmio_idx_lock     - MMIO index access (spinlock)
> 
> The implementation provides:
> - Lock ordering training at module init (amdgpu_lockdep_init)
> - Lock class association for real driver locks (amdgpu_lockdep_set_class)
> 
> Dummy locks are associated with the same class keys as real driver locks
> via lockdep_set_class(), ensuring lockdep connects the training ordering
> with actual runtime locks.
> 
> Testing:
>   Build the kernel with CONFIG_PROVE_LOCKING=y (enables CONFIG_LOCKDEP):
>     scripts/config --enable PROVE_LOCKING
>     scripts/config --enable DEBUG_LOCKDEP
> 
>   On boot, dmesg should show:
>     AMDGPU: Lockdep annotations initialized (9 lock levels)
> 
>   The companion IGT test (tests/amdgpu/amd_lockdep) exercises lock-heavy
>   GPU code paths concurrently to trigger lockdep warnings on violations:
>     sudo ./build/tests/amdgpu/amd_lockdep
>     sudo dmesg | grep -A 50 "circular locking dependency"
> 
>   IGT subtests:
>     concurrent-reset-and-submit  - reset_sem vs submission locks
>     concurrent-mmap-and-evict    - mmap_lock vs vram_lock
>     concurrent-userptr-and-reset - notifier_lock vs reset_sem
>     stress-all-paths             - all of the above simultaneously
> 
>   A clean dmesg (no "circular locking dependency" or "possible recursive
>   locking detected" messages) confirms no lock ordering violations.
> 
>   For CI integration, the test should be run on kernels compiled with
>   CONFIG_LOCKDEP=y; dmesg is scanned post-run for lockdep splats.
> 
> v2: (Christian)
> - Move notifier_lock and vram_lock before reset locks in hierarchy.
>   HMM invalidation holds notifier_lock and can wait for GPU reset
>   completion, so notifier_lock must be outer to reset_domain->sem.
> - Associate dummy locks with lock class keys via lockdep_set_class()
>   so lockdep connects training with real driver locks.
> - Update commit message to list all 9 lock levels.
> 
> Requires CONFIG_PROVE_LOCKING=y to activate.
> 
> Cc: Christian Konig <[email protected]>
> Cc: Alex Deucher <[email protected]>
> Signed-off-by: Vitaly Prosyak <[email protected]>

Reviewed-by: Christian König <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile         |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h         |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c     |   3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c | 195 ++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h |  39 ++++
>  6 files changed, 242 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index ee3574797bc2..ba80542ead9d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -69,7 +69,7 @@ amdgpu-y += amdgpu_device.o amdgpu_reg_access.o 
> amdgpu_doorbell_mgr.o amdgpu_kms
>       amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
>       amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
>       amdgpu_fw_attestation.o amdgpu_securedisplay.o \
> -     amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
> +     amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o 
> amdgpu_lockdep.o \
>       amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o 
> amdgpu_dev_coredump.o \
>       amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o amdgpu_ip.o
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5d7bfa59424a..7b09410d6d8f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -105,6 +105,7 @@
>  #include "amdgpu_mca.h"
>  #include "amdgpu_aca.h"
>  #include "amdgpu_ras.h"
> +#include "amdgpu_lockdep.h"
>  #include "amdgpu_cper.h"
>  #include "amdgpu_xcp.h"
>  #include "amdgpu_seq64.h"
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5ff224163bab..10e485ff055c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3752,6 +3752,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>       mutex_init(&adev->pm.stable_pstate_ctx_lock);
>       mutex_init(&adev->benchmark_mutex);
>       mutex_init(&adev->gfx.reset_sem_mutex);
> +
> +     /* Associate locks with lockdep classes for ordering validation */
> +     amdgpu_lockdep_set_class(adev);
>       /* Initialize the mutex for cleaner shader isolation between GFX and 
> compute processes */
>       mutex_init(&adev->enforce_isolation_mutex);
>       for (i = 0; i < MAX_XCP; ++i) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 1781c0c3d010..bf4260269681 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -3158,6 +3158,9 @@ static int __init amdgpu_init(void)
>  {
>       int r;
>  
> +     /* Train lockdep on correct lock ordering */
> +     amdgpu_lockdep_init();
> +
>       r = amdgpu_sync_init();
>       if (r)
>               return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c
> new file mode 100644
> index 000000000000..d5d71fd7c70d
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.c
> @@ -0,0 +1,195 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + *
> + * Lockdep annotation for AMDGPU lock ordering
> + *
> + * This module teaches lockdep the correct lock ordering to catch
> + * potential deadlocks at development time rather than runtime.
> + *
> + * Based on dma-resv lockdep approach from:
> + * drivers/dma-buf/dma-resv.c:dma_resv_lockdep()
> + */
> +
> +#include "amdgpu.h"
> +#include "amdgpu_reset.h"
> +
> +#ifdef CONFIG_LOCKDEP
> +
> +/* Lock class keys for associating with real driver locks */
> +static struct lock_class_key amdgpu_userq_sch_mutex_key;
> +static struct lock_class_key amdgpu_userq_mutex_key;
> +static struct lock_class_key amdgpu_notifier_lock_key;
> +static struct lock_class_key amdgpu_vram_lock_key;
> +static struct lock_class_key amdgpu_reset_sem_key;
> +static struct lock_class_key amdgpu_reset_lock_key;
> +static struct lock_class_key amdgpu_srbm_lock_key;
> +static struct lock_class_key amdgpu_grbm_lock_key;
> +static struct lock_class_key amdgpu_mmio_lock_key;
> +
> +/**
> + * amdgpu_lockdep_set_class - Associate lock class keys with real locks
> + * @adev: AMDGPU device
> + *
> + * Call during device init to associate lock classes with actual locks
> + * so lockdep can track them properly.
> + */
> +void amdgpu_lockdep_set_class(struct amdgpu_device *adev)
> +{
> +     lockdep_set_class(&adev->gfx.userq_sch_mutex,
> +                       &amdgpu_userq_sch_mutex_key);
> +     lockdep_set_class(&adev->notifier_lock, &amdgpu_notifier_lock_key);
> +     lockdep_set_class(&adev->srbm_mutex, &amdgpu_srbm_lock_key);
> +     lockdep_set_class(&adev->grbm_idx_mutex, &amdgpu_grbm_lock_key);
> +     lockdep_set_class(&adev->mmio_idx_lock, &amdgpu_mmio_lock_key);
> +
> +     if (adev->reset_domain)
> +             lockdep_set_class(&adev->reset_domain->sem,
> +                               &amdgpu_reset_sem_key);
> +}
> +
> +/**
> + * amdgpu_lockdep_init - Teach lockdep the correct lock ordering
> + *
> + * Instantiates dummy objects and takes locks in the correct order to
> + * train lockdep. This helps catch lock ordering violations during
> + * development.
> + *
> + * Lock ordering hierarchy (outermost to innermost):
> + *
> + * 1. userq_sch_mutex     - Global userq scheduler (enforce_isolation)
> + * 2. userq_mutex         - Per-context userq (held across queue 
> create/destroy)
> + * 3. notifier_lock       - MMU notifier lock
> + * 4. vram_lock           - VRAM allocator lock
> + * 5. reset_domain->sem   - GPU reset synchronization
> + * 6. reset_lock          - Reset control lock
> + * 7. srbm_mutex          - SRBM register access
> + * 8. grbm_idx_mutex      - GRBM index access
> + * 9. mmio_idx_lock       - MMIO index access (spinlock)
> + *
> + * Evidence:
> + * - userq_sch_mutex -> userq_mutex: amdgpu_gfx_kfd_sch_ctrl() calls
> + *   amdgpu_userq_stop_sched_for_enforce_isolation() which takes userq_mutex
> + * - userq_mutex -> notifier_lock: userq paths may trigger MMU notifier
> + *   invalidation which acquires notifier_lock
> + * - notifier_lock -> reset_domain->sem: HMM invalidation callback holds
> + *   notifier_lock and can wait for GPU reset completion, so notifier_lock
> + *   must be outer to reset_domain->sem
> + * - vram_lock -> reset_domain->sem: VRAM management paths may need to
> + *   wait for ongoing reset to complete
> + *
> + * Note: mmap_lock ordering relative to GPU locks is already taught
> + * by dma-resv (drivers/dma-buf/dma-resv.c).
> + */
> +int amdgpu_lockdep_init(void)
> +{
> +     struct amdgpu_reset_domain *reset_domain = NULL;
> +     struct amdgpu_reset_control reset_ctl;
> +     struct mutex userq_sch_mutex;
> +     struct mutex userq_mutex;
> +     struct mutex notifier_lock;
> +     struct mutex vram_lock;
> +     struct mutex srbm_mutex;
> +     struct mutex grbm_idx_mutex;
> +     spinlock_t mmio_idx_lock;
> +     unsigned long flags;
> +
> +     /*
> +      * Initialize dummy reset domain
> +      */
> +     reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE,
> +                                                     "lockdep_test");
> +     if (!reset_domain)
> +             return -ENOMEM;
> +
> +     /* Initialize dummy locks */
> +     mutex_init(&userq_sch_mutex);
> +     mutex_init(&userq_mutex);
> +     mutex_init(&notifier_lock);
> +     mutex_init(&vram_lock);
> +     mutex_init(&reset_ctl.reset_lock);
> +     mutex_init(&srbm_mutex);
> +     mutex_init(&grbm_idx_mutex);
> +     spin_lock_init(&mmio_idx_lock);
> +
> +     /*
> +      * Associate dummy locks with the same class keys used for real
> +      * driver locks. This ensures lockdep connects the ordering learned
> +      * here with the actual locks used at runtime.
> +      */
> +     lockdep_set_class(&userq_sch_mutex, &amdgpu_userq_sch_mutex_key);
> +     lockdep_set_class(&userq_mutex, &amdgpu_userq_mutex_key);
> +     lockdep_set_class(&notifier_lock, &amdgpu_notifier_lock_key);
> +     lockdep_set_class(&vram_lock, &amdgpu_vram_lock_key);
> +     lockdep_set_class(&reset_domain->sem, &amdgpu_reset_sem_key);
> +     lockdep_set_class(&reset_ctl.reset_lock, &amdgpu_reset_lock_key);
> +     lockdep_set_class(&srbm_mutex, &amdgpu_srbm_lock_key);
> +     lockdep_set_class(&grbm_idx_mutex, &amdgpu_grbm_lock_key);
> +     lockdep_set_class(&mmio_idx_lock, &amdgpu_mmio_lock_key);
> +
> +     /*
> +      * Take locks in the correct order to train lockdep.
> +      * This establishes the dependency chain.
> +      */
> +
> +     /* Level 1: Global userq scheduler mutex (outermost) */
> +     mutex_lock(&userq_sch_mutex);
> +
> +     /* Level 2: Per-context userq mutex */
> +     mutex_lock(&userq_mutex);
> +
> +     /* Level 3: MMU notifier lock */
> +     mutex_lock(&notifier_lock);
> +
> +     /* Level 4: VRAM allocator lock */
> +     mutex_lock(&vram_lock);
> +
> +     /* Level 5: Reset domain semaphore */
> +     down_read(&reset_domain->sem);
> +
> +     /* Level 6: Reset control lock */
> +     mutex_lock(&reset_ctl.reset_lock);
> +
> +     /*
> +      * Mark potential memory reclaim boundary.
> +      * GPU operations might trigger memory allocation/reclaim.
> +      */
> +     fs_reclaim_acquire(GFP_KERNEL);
> +
> +     /* Level 7: SRBM register access */
> +     mutex_lock(&srbm_mutex);
> +
> +     /* Level 8: GRBM index access */
> +     mutex_lock(&grbm_idx_mutex);
> +
> +     /* Level 9: MMIO index access (innermost lock, spinlock) */
> +     spin_lock_irqsave(&mmio_idx_lock, flags);
> +
> +     /*
> +      * All locks acquired in order.
> +      * Lockdep has now learned the valid dependency chain.
> +      */
> +
> +     /* Release in reverse order */
> +     spin_unlock_irqrestore(&mmio_idx_lock, flags);
> +     mutex_unlock(&grbm_idx_mutex);
> +     mutex_unlock(&srbm_mutex);
> +
> +     fs_reclaim_release(GFP_KERNEL);
> +
> +     mutex_unlock(&reset_ctl.reset_lock);
> +     up_read(&reset_domain->sem);
> +     mutex_unlock(&vram_lock);
> +     mutex_unlock(&notifier_lock);
> +     mutex_unlock(&userq_mutex);
> +     mutex_unlock(&userq_sch_mutex);
> +
> +     /* Cleanup */
> +     amdgpu_reset_put_reset_domain(reset_domain);
> +
> +     pr_info("AMDGPU: Lockdep annotations initialized (9 lock levels)\n");
> +
> +     return 0;
> +}
> +
> +#endif /* CONFIG_LOCKDEP */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h
> new file mode 100644
> index 000000000000..04adb58665bf
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_lockdep.h
> @@ -0,0 +1,39 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright 2024 Advanced Micro Devices, Inc.
> + *
> + * Lockdep annotation interface for AMDGPU
> + */
> +
> +#ifndef __AMDGPU_LOCKDEP_H__
> +#define __AMDGPU_LOCKDEP_H__
> +
> +#include <linux/lockdep.h>
> +
> +struct amdgpu_device;
> +
> +#ifdef CONFIG_LOCKDEP
> +
> +/**
> + * amdgpu_lockdep_init - Train lockdep on correct lock ordering
> + *
> + * Call once during module init to establish the lock dependency chain.
> + */
> +int amdgpu_lockdep_init(void);
> +
> +/**
> + * amdgpu_lockdep_set_class - Associate lock class keys with real locks
> + * @adev: AMDGPU device
> + *
> + * Call during device init to associate lock classes with actual locks.
> + */
> +void amdgpu_lockdep_set_class(struct amdgpu_device *adev);
> +
> +#else /* !CONFIG_LOCKDEP */
> +
> +static inline int amdgpu_lockdep_init(void) { return 0; }
> +static inline void amdgpu_lockdep_set_class(struct amdgpu_device *adev) {}
> +
> +#endif /* CONFIG_LOCKDEP */
> +
> +#endif /* __AMDGPU_LOCKDEP_H__ */

Reply via email to