Initialise the MADVISE_AUTORESET notifier infrastructure for fault-mode
VMs and tear it down during VM close.

VM close drops vm->lock around xe_vm_madvise_fini(), because fini drains
the madvise workers and those workers take vm->lock to process pending
unmap work. The close path then retakes vm->lock before xe_svm_fini().

The MADVISE ioctl collects CPU mirror VMA ranges under vm->lock and
registers interval notifiers after dropping it. Registration is
best-effort because mmu_interval_notifier_insert() may take mmap_lock.

If registration fails, clear MADV_AUTORESET on the CPU mirror VMAs that
overlap the failed range (they may have been split or merged in the
meantime), but keep cpu_autoreset_active set, since the VMA is still
CPU-only until the first successful GPU fault.

Also skip SVM PTE zapping for CPU mirror VMAs while cpu_autoreset_active
is set, since no GPU PTEs should exist yet.

v2:
  - Register notifiers outside vm->lock; take vm->lock only for dedup and
    mtree_store_range. (Matt)
  - Collect VMA ranges under vm->lock and register notifiers after
    unlock. (Matt)

v3:
  - Allocate notifier ranges before mutating VMA state.
  - Fix notifier-lock unwind when the madvise type is invalid.
  - On notifier registration failure, clear MADV_AUTORESET but keep
    cpu_autoreset_active.

Cc: Matthew Brost <[email protected]>
Cc: Thomas Hellström <[email protected]>
Cc: Himal Prasad Ghimiray <[email protected]>
Signed-off-by: Arvind Yadav <[email protected]>
---
 drivers/gpu/drm/xe/xe_svm.c        |   9 +++
 drivers/gpu/drm/xe/xe_vm.c         |  14 ++++
 drivers/gpu/drm/xe/xe_vm_madvise.c | 105 ++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index b58857668d48..4169cfae7b51 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -914,6 +914,15 @@ int xe_svm_init(struct xe_vm *vm)
                        drm_pagemap_release_owner(&vm->svm.peer);
                        return err;
                }
+
+               /* Initialize after gpusvm. */
+               err = xe_vm_madvise_init(vm);
+               if (err) {
+                       drm_gpusvm_fini(&vm->svm.gpusvm);
+                       xe_svm_put_pagemaps(vm);
+                       drm_pagemap_release_owner(&vm->svm.peer);
+                       return err;
+               }
        } else {
                err = drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM (simple)",
                                      &vm->xe->drm, NULL, 0, 0, 0, NULL,
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index dec5279f08a2..e93d715d3da9 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -1815,6 +1815,8 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
 err_svm_fini:
        if (flags & XE_VM_FLAG_FAULT_MODE) {
                vm->size = 0; /* close the vm */
+               /* Safe if madvise init did not run. */
+               xe_vm_madvise_fini(vm);
                xe_svm_fini(vm);
        }
 err_no_resv:
@@ -1959,6 +1961,18 @@ void xe_vm_close_and_put(struct xe_vm *vm)
                xe_vma_destroy_unlocked(vma);
        }
 
+       /*
+        * The VM is closed here, so no new ioctl should enter. Drop vm->lock
+        * while draining madvise workers because they take vm->lock.
+        */
+       xe_assert(vm->xe, xe_vm_is_closed(vm));
+       up_write(&vm->lock);
+
+       if (vm->flags & XE_VM_FLAG_FAULT_MODE)
+               xe_vm_madvise_fini(vm);
+
+       down_write(&vm->lock);
+
        xe_svm_fini(vm);
 
        up_write(&vm->lock);
diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
index 6c42ce8e3f52..eb2d17abf9d4 100644
--- a/drivers/gpu/drm/xe/xe_vm_madvise.c
+++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
@@ -28,6 +28,7 @@ struct xe_vmas_in_madvise_range {
        int num_vmas;
        bool has_bo_vmas;
        bool has_svm_userptr_vmas;
+       bool has_cpu_addr_mirror_vmas;
 };
 
 /**
@@ -70,7 +71,11 @@ static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range *madvise_r
 
                if (xe_vma_bo(vma))
                        madvise_range->has_bo_vmas = true;
-               else if (xe_vma_is_cpu_addr_mirror(vma) || 
xe_vma_is_userptr(vma))
+               else if (xe_vma_is_cpu_addr_mirror(vma)) {
+                       /* CPU mirror VMAs use the SVM notifier lock. */
+                       madvise_range->has_svm_userptr_vmas = true;
+                       madvise_range->has_cpu_addr_mirror_vmas = true;
+               } else if (xe_vma_is_userptr(vma))
                        madvise_range->has_svm_userptr_vmas = true;
 
                if (madvise_range->num_vmas == max_vmas) {
@@ -297,7 +302,12 @@ static u8 xe_zap_ptes_in_madvise_range(struct xe_vm *vm, u64 start, u64 end)
                        continue;
 
                if (xe_vma_is_cpu_addr_mirror(vma)) {
-                       tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm,
+                       /*
+                        * CPU-only mirror VMAs have no GPU PTEs yet.
+                        * Once GPU-touched, SVM zap applies.
+                        */
+                       if (!xe_vma_has_cpu_autoreset_active(vma))
+                               tile_mask |= xe_svm_ranges_zap_ptes_in_range(vm,
                                                                      
xe_vma_start(vma),
                                                                      
xe_vma_end(vma));
                } else {
@@ -559,6 +569,32 @@ static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas,
        }
        return true;
 }
+
+/*
+ * Clear XE_VMA_MADV_AUTORESET on CPU mirror VMAs overlapping [start, end).
+ * The VMAs may have been split or merged by now, so match by overlap only.
+ */
+static void xe_vm_madvise_clear_autoreset_range(struct xe_vm *vm,
+                                               u64 start, u64 end)
+{
+       u64 addr = start;
+
+       lockdep_assert_held_write(&vm->lock);
+
+       while (addr < end) {
+               struct xe_vma *vma;
+
+               vma = xe_vm_find_overlapping_vma(vm, addr, end - addr);
+               if (!vma)
+                       break; /* lookup found nothing in [addr, end) */
+
+               if (xe_vma_is_cpu_addr_mirror(vma))
+                       vma->gpuva.flags &= ~XE_VMA_MADV_AUTORESET;
+
+               addr = xe_vma_end(vma); /* continue the walk past this VMA */
+       }
+}
+
 /**
  * xe_vm_madvise_ioctl - Handle MADVise ioctl for a VM
  * @dev: DRM device pointer
@@ -590,6 +626,11 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil
        struct drm_exec exec;
        int err, attr_type;
        bool do_retained;
+       struct {
+               u64 start;
+               u64 end;
+       } *notifier_ranges = NULL;
+       int num_notifier_ranges = 0;
 
        vm = xe_vm_lookup(xef, args->vm_id);
        if (XE_IOCTL_DBG(xe, !vm))
@@ -661,6 +702,20 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil
                }
        }
 
+       /*
+        * Allocate before taking BO dma-resv locks. GFP_KERNEL may enter
+        * reclaim, which can reach TTM shrinkers and dma-resv locks.
+        */
+       if (madvise_range.has_cpu_addr_mirror_vmas) {
+               notifier_ranges = kvmalloc_array(madvise_range.num_vmas,
+                                                sizeof(*notifier_ranges),
+                                                GFP_KERNEL);
+               if (!notifier_ranges) {
+                       err = -ENOMEM;
+                       goto free_vmas;
+               }
+       }
+
        if (madvise_range.has_bo_vmas) {
                if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) {
                        if (!check_bo_args_are_sane(vm, madvise_range.vmas,
@@ -708,7 +763,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil
        /* Ensure the madvise function exists for this type */
        if (!madvise_funcs[attr_type]) {
                err = -EINVAL;
-               goto err_fini;
+               goto err_unlock_notifier;
        }
 
        madvise_funcs[attr_type](xe, vm, madvise_range.vmas, 
madvise_range.num_vmas, args,
@@ -717,9 +772,29 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil
        err = xe_vm_invalidate_madvise_range(vm, madvise_range.addr,
                                             madvise_range.addr + args->range);
 
+err_unlock_notifier:
        if (madvise_range.has_svm_userptr_vmas)
                xe_svm_notifier_unlock(vm);
 
+       if (err)
+               goto err_fini;
+
+       if (madvise_range.has_cpu_addr_mirror_vmas) {
+               for (int i = 0; i < madvise_range.num_vmas; i++) {
+                       struct xe_vma *vma = madvise_range.vmas[i];
+
+                       if (!xe_vma_is_cpu_addr_mirror(vma))
+                               continue;
+                       if (!(vma->gpuva.flags & XE_VMA_MADV_AUTORESET))
+                               continue;
+                       if (!xe_vma_has_cpu_autoreset_active(vma))
+                               continue;
+                       notifier_ranges[num_notifier_ranges].start = 
xe_vma_start(vma);
+                       notifier_ranges[num_notifier_ranges].end   = 
xe_vma_end(vma);
+                       num_notifier_ranges++;
+               }
+       }
+
 err_fini:
        if (madvise_range.has_bo_vmas)
                drm_exec_fini(&exec);
@@ -731,6 +806,30 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil
 unlock_vm:
        up_write(&vm->lock);
 
+       if (!err) {
+               for (int i = 0; i < num_notifier_ranges; i++) {
+                       int ret = xe_vm_madvise_register_notifier_range(vm,
+                                                       
notifier_ranges[i].start,
+                                                       notifier_ranges[i].end);
+                       if (ret) {
+                               drm_warn(&vm->xe->drm,
+                                        "Failed to register madvise notifier 
[%#llx-%#llx]: %d\n",
+                                        notifier_ranges[i].start,
+                                        notifier_ranges[i].end, ret);
+                               /*
+                                * Disable AUTORESET, but keep 
cpu_autoreset_active.
+                                * The VMA is still CPU-only.
+                                */
+                               down_write(&vm->lock);
+                               xe_vm_madvise_clear_autoreset_range(vm,
+                                                                   
notifier_ranges[i].start,
+                                                                   
notifier_ranges[i].end);
+                               up_write(&vm->lock);
+                       }
+               }
+       }
+       kvfree(notifier_ranges);
+
        /* Write retained value to user after releasing all locks */
        if (!err && do_retained)
                err = xe_madvise_purgeable_retained_to_user(&details);
-- 
2.43.0

Reply via email to