Introduce an rw-semaphore to serialize migration to device memory when
it is likely that the migration races with another device migration of
the same CPU address space range. This is a temporary measure to
mitigate a livelock that may occur when many devices try to migrate the
same range at the same time, and it affects only devices using the xe
driver. A longer-term fix is probably improvements in the core mm
migration layer.
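
For reference, below is a minimal, self-contained sketch of the
read-to-write lock upgrade pattern this patch uses: the first migration
attempt runs under the shared lock so unrelated migrations can proceed
in parallel, and only the -EBUSY retry is serialized under the
exclusive lock. The names migrate_with_upgrade() and do_migrate() are
made up for illustration only and are not part of this patch or of the
xe driver:

  #include <linux/errno.h>
  #include <linux/rwsem.h>
  #include <linux/types.h>

  static DECLARE_RWSEM(migrate_lock);

  /* do_migrate() is a stand-in returning 0 on success or -EBUSY. */
  static int migrate_with_upgrade(int (*do_migrate)(void))
  {
  	bool write_locked = false;
  	int err, retries = 1;

  	/* First attempt in shared mode: concurrent migrations allowed. */
  	err = down_read_interruptible(&migrate_lock);
  	if (err)
  		return err;

  	do {
  		err = do_migrate();

  		if (err == -EBUSY && retries && !write_locked) {
  			int lock_err;

  			/*
  			 * Drop the shared lock and retake it exclusively,
  			 * so the retry runs with other device migrations
  			 * of the same range held off.
  			 */
  			up_read(&migrate_lock);
  			lock_err = down_write_killable(&migrate_lock);
  			if (lock_err)
  				return lock_err;
  			write_locked = true;
  		}
  	} while (err == -EBUSY && retries--);

  	if (write_locked)
  		up_write(&migrate_lock);
  	else
  		up_read(&migrate_lock);

  	return err;
  }

Unlike the real code, the sketch leaves out the eviction of the range
before the retry; it only illustrates the locking.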

Suggested-by: Matthew Brost <[email protected]>
Signed-off-by: Thomas Hellström <[email protected]>
Reviewed-by: Matthew Brost <[email protected]>
---
 drivers/gpu/drm/xe/xe_svm.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index 84ff99aa3e49..fa2ee2c08f31 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -1593,10 +1593,12 @@ struct drm_pagemap *xe_vma_resolve_pagemap(struct xe_vma *vma, struct xe_tile *t
 int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *ctx,
                      struct drm_pagemap *dpagemap)
 {
+       static DECLARE_RWSEM(driver_migrate_lock);
        struct xe_vm *vm = range_to_vm(&range->base);
        enum drm_gpusvm_scan_result migration_state;
        struct xe_device *xe = vm->xe;
        int err, retries = 1;
+       bool write_locked = false;
 
        xe_assert(range_to_vm(&range->base)->xe, range->base.pages.flags.migrate_devmem);
        range_debug(range, "ALLOCATE VRAM");
@@ -1615,16 +1617,32 @@ int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *c
                drm_dbg(&xe->drm, "Request migration to device memory on \"%s\".\n",
                        dpagemap->drm->unique);
 
+       err = down_read_interruptible(&driver_migrate_lock);
+       if (err)
+               return err;
        do {
                err = drm_pagemap_populate_mm(dpagemap, xe_svm_range_start(range),
                                              xe_svm_range_end(range),
                                              range->base.gpusvm->mm,
                                              ctx->timeslice_ms);
 
-               if (err == -EBUSY && retries)
-                       drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
+               if (err == -EBUSY && retries) {
+                       if (!write_locked) {
+                               int lock_err;
 
+                               up_read(&driver_migrate_lock);
+                               lock_err = down_write_killable(&driver_migrate_lock);
+                               if (lock_err)
+                                       return lock_err;
+                               write_locked = true;
+                       }
+                       drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
+               }
        } while (err == -EBUSY && retries--);
+       if (write_locked)
+               up_write(&driver_migrate_lock);
+       else
+               up_read(&driver_migrate_lock);
 
        return err;
 }
-- 
2.51.1
