With PROVE_LOCKING on an Snapdragon X1 and VM reclaim pressure, we see:

"""
kswapd0/121 is trying to acquire lock:
ffff800080ed3800 (reservation_ww_class_acquire){+.+.}-{0:0}, at:
  msm_gem_shrinker_scan (drivers/gpu/drm/msm/msm_gem_shrinker.c:189)

but task is already holding lock:
ffffbf4ddb44ca40 (fs_reclaim){+.+.}-{0:0}, at:
  balance_pgdat (mm/vmscan.c:7236 (discriminator 2))

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (fs_reclaim){+.+.}-{0:0}:
lock_acquire (kernel/locking/lockdep.c:5868 kernel/locking/lockdep.c:5825)
fs_reclaim_acquire (mm/page_alloc.c:4325 mm/page_alloc.c:4339)
dma_resv_lockdep (drivers/dma-buf/dma-resv.c:798)
do_one_initcall (init/main.c:1392)
kernel_init_freeable (init/main.c:1454 (discriminator 1) init/main.c:1470
  (discriminator 1) init/main.c:1490 (discriminator 1) init/main.c:1703
  (discriminator 1))
kernel_init (init/main.c:1593)
ret_from_fork (arch/arm64/kernel/entry.S:858)

-> #1 (reservation_ww_class_mutex){+.+.}-{4:4}:
lock_acquire (kernel/locking/lockdep.c:5868 kernel/locking/lockdep.c:5825)
dma_resv_lockdep (./include/linux/ww_mutex.h:164 (discriminator 1)
  drivers/dma-buf/dma-resv.c:791 (discriminator 1))
do_one_initcall (init/main.c:1392)
kernel_init_freeable (init/main.c:1454 (discriminator 1) init/main.c:1470
  (discriminator 1) init/main.c:1490 (discriminator 1) init/main.c:1703
  (discriminator 1))
kernel_init (init/main.c:1593)
ret_from_fork (arch/arm64/kernel/entry.S:858)

-> #0 (reservation_ww_class_acquire){+.+.}-{0:0}:
check_prev_add (kernel/locking/lockdep.c:3165)
__lock_acquire (kernel/locking/lockdep.c:3284
  kernel/locking/lockdep.c:3908 kernel/locking/lockdep.c:5237)
lock_acquire (kernel/locking/lockdep.c:5868 kernel/locking/lockdep.c:5825)
drm_gem_lru_scan (./include/linux/ww_mutex.h:163 (discriminator 1)
  drivers/gpu/drm/drm_gem.c:1681 (discriminator 1))
msm_gem_shrinker_scan (drivers/gpu/drm/msm/msm_gem_shrinker.c:189)
do_shrink_slab (mm/shrinker.c:436)
shrink_slab (mm/shrinker.c:667)
shrink_one (mm/vmscan.c:5071)
shrink_node (mm/vmscan.c:5132 mm/vmscan.c:5210 mm/vmscan.c:6198)
balance_pgdat (mm/vmscan.c:7052 mm/vmscan.c:7228)
kswapd (mm/vmscan.c:7501)
kthread (kernel/kthread.c:436)
ret_from_fork (arch/arm64/kernel/entry.S:858)

other info that might help us debug this:

Chain exists of:
reservation_ww_class_acquire --> reservation_ww_class_mutex --> fs_reclaim
"""

kswapd0 holding fs_reclaim calls the MSM shrinker, which calls
dma_resv_lock. This in turn acquires fs_reclaim.

Fix this deadlock by using dma_resv_trylock() instead, dropping the
subsequently unused passed wait-wound lock 'ticket'.

Cc: [email protected]
Signed-off-by: Daniel J Blueman <[email protected]>
Fixes: fe4952b5f27c ("drm/msm: Convert vm locking")
---
 drivers/gpu/drm/msm/msm_gem_shrinker.c | 34 ++++++++++----------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c 
b/drivers/gpu/drm/msm/msm_gem_shrinker.c
index 31fa51a44f86..5320ef57dd90 100644
--- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
+++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
@@ -43,8 +43,7 @@ msm_gem_shrinker_count(struct shrinker *shrinker, struct 
shrink_control *sc)
 }
 
 static bool
-with_vm_locks(struct ww_acquire_ctx *ticket,
-             void (*fn)(struct drm_gem_object *obj),
+with_vm_locks(void (*fn)(struct drm_gem_object *obj),
              struct drm_gem_object *obj)
 {
        /*
@@ -52,7 +51,7 @@ with_vm_locks(struct ww_acquire_ctx *ticket,
         * success paths
         */
        struct drm_gpuvm_bo *vm_bo, *last_locked = NULL;
-       int ret = 0;
+       bool locked = true;
 
        drm_gem_for_each_gpuvm_bo (vm_bo, obj) {
                struct dma_resv *resv = drm_gpuvm_resv(vm_bo->vm);
@@ -60,23 +59,14 @@ with_vm_locks(struct ww_acquire_ctx *ticket,
                if (resv == obj->resv)
                        continue;
 
-               ret = dma_resv_lock(resv, ticket);
-
                /*
-                * Since we already skip the case when the VM and obj
-                * share a resv (ie. _NO_SHARE objs), we don't expect
-                * to hit a double-locking scenario... which the lock
-                * unwinding cannot really cope with.
+                * dma_resv_lock can't be used due to acquiring 'ticket' before 
the
+                * fs_reclaim lock, which is held in shrinker context
                 */
-               WARN_ON(ret == -EALREADY);
-
-               /*
-                * Don't bother with slow-lock / backoff / retry sequence,
-                * if we can't get the lock just give up and move on to
-                * the next object.
-                */
-               if (ret)
+               if (!dma_resv_trylock(resv)) {
+                       locked = false;
                        goto out_unlock;
+               }
 
                /*
                 * Hold a ref to prevent the vm_bo from being freed
@@ -108,7 +98,7 @@ with_vm_locks(struct ww_acquire_ctx *ticket,
                }
        }
 
-       return ret == 0;
+       return locked;
 }
 
 static bool
@@ -120,7 +110,7 @@ purge(struct drm_gem_object *obj, struct ww_acquire_ctx 
*ticket)
        if (msm_gem_active(obj))
                return false;
 
-       return with_vm_locks(ticket, msm_gem_purge, obj);
+       return with_vm_locks(msm_gem_purge, obj);
 }
 
 static bool
@@ -164,7 +154,6 @@ static unsigned long
 msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
 {
        struct msm_drm_private *priv = shrinker->private_data;
-       struct ww_acquire_ctx ticket;
        struct {
                struct drm_gem_lru *lru;
                bool (*shrink)(struct drm_gem_object *obj, struct 
ww_acquire_ctx *ticket);
@@ -185,11 +174,14 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct 
shrink_control *sc)
        for (unsigned i = 0; (nr > 0) && (i < ARRAY_SIZE(stages)); i++) {
                if (!stages[i].cond)
                        continue;
+               /*
+                * 'ticket' not needed on trylock paths
+                */
                stages[i].freed =
                        drm_gem_lru_scan(stages[i].lru, nr,
                                         &stages[i].remaining,
                                         stages[i].shrink,
-                                        &ticket);
+                                        NULL);
                nr -= stages[i].freed;
                freed += stages[i].freed;
                remaining += stages[i].remaining;
-- 
2.53.0

Reply via email to