From: Honglei Huang <[email protected]>

Implement the core restore infrastructure and invalidation callback for
xnack-off SVM. The invalidate callback handles MMU notifier events by
quiescing compute queues when needed and scheduling restore work.

For MMU_NOTIFY_UNMAP events, avoid unconditionally quiescing compute
queues -- only quiesce when a range is partially unmapped and has active
GPU mappings (i.e. the GC worker will need a rebuild window). Full-range
unmaps and ranges without GPU mappings skip the quiesce entirely, which
reduces unnecessary queue preemption traffic.

Non-UNMAP events (CLEAR/MIGRATE) always quiesce because PTEs are cleared
synchronously and the restore is asynchronous.

Signed-off-by: Honglei Huang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c | 257 ++++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c
new file mode 100644
index 000000000..b231c7d44
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2026 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu_svm.h"
+#include "amdgpu_userptr.h"
+#include "amdgpu_svm_range.h"
+#include "amdgpu_svm_attr.h"
+#include "amdgpu.h"
+#include "amdgpu_amdkfd.h"
+
+#include <drm/drm_gpusvm.h>
+
+#include <uapi/linux/kfd_ioctl.h>
+
+#define AMDGPU_SVM_RESTORE_WQ_NAME "amdgpu_svm_restore"
+
+/*
+ * Forward declarations.
+ *
+ * amdgpu_svm_restore_worker() is the handler bound to svm->restore.work in
+ * amdgpu_svm_restore_init().
+ * NOTE(review): its definition is not visible in this hunk -- confirm it is
+ * provided before this file is built.
+ *
+ * svm_restore_enqueue_work() is used by the notifier path ahead of its
+ * definition further down in this file.
+ */
+static void amdgpu_svm_restore_worker(struct work_struct *w);
+
+static void
+svm_restore_enqueue_work(struct amdgpu_svm *svm, struct amdgpu_svm_range *range,
+                        unsigned long start_page, unsigned long last_page,
+                        uint8_t pending_ops);
+
+/*
+ * Final notifier step for a single range: call drm_gpusvm_range_unmap_pages()
+ * on it and queue the follow-up work for the affected page span.
+ *
+ * The span is the overlap of the range with [mmu_range->start, mmu_range->end),
+ * converted to page numbers; last_page is the final page of the overlap (the
+ * end address is treated as exclusive, hence the "- 1").
+ *
+ * MMU_NOTIFY_UNMAP marks the range unmapped and queues an UNMAP op; every
+ * other event queues a RESTORE op.
+ */
+static void
+svm_restore_notifier_event_end(struct amdgpu_svm *svm,
+                  struct drm_gpusvm_range *range,
+                  const struct mmu_notifier_range *mmu_range)
+{
+       struct amdgpu_svm_range *svm_range = to_amdgpu_svm_range(range);
+       struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
+       unsigned long span_start, span_end;
+       unsigned long start_page, last_page;
+
+       span_start = max(drm_gpusvm_range_start(&svm_range->base),
+                        mmu_range->start);
+       span_end = min(drm_gpusvm_range_end(&svm_range->base),
+                      mmu_range->end);
+       start_page = span_start >> PAGE_SHIFT;
+       last_page = (span_end >> PAGE_SHIFT) - 1;
+
+       amdgpu_svm_assert_in_notifier(svm);
+
+       drm_gpusvm_range_unmap_pages(&svm->gpusvm, range, &ctx);
+
+       if (mmu_range->event != MMU_NOTIFY_UNMAP) {
+               AMDGPU_SVM_RANGE_DEBUG(svm_range, "RESTORE WORKER ADD");
+               svm_restore_enqueue_work(svm, svm_range, start_page, last_page,
+                                        AMDGPU_SVM_RANGE_OP_RESTORE);
+               return;
+       }
+
+       AMDGPU_SVM_RANGE_DEBUG(svm_range, "GARBAGE COLLECTOR ADD");
+       drm_gpusvm_range_set_unmapped(&svm_range->base, mmu_range);
+       svm_restore_enqueue_work(svm, svm_range, start_page, last_page,
+                                AMDGPU_SVM_RANGE_OP_UNMAP);
+}
+
+/*
+ * Count one more eviction in restore.evicted_ranges and invoke the
+ * restore.begin() callback only when the counter goes from 0 to 1.
+ */
+static void svm_restore_eviction_begin(struct amdgpu_svm *svm)
+{
+       int evicted = atomic_inc_return(&svm->restore.evicted_ranges);
+
+       if (evicted == 1)
+               svm->restore.begin(svm);
+}
+
+/*
+ * Decide whether an MMU_NOTIFY_UNMAP that hits @range requires a quiesce.
+ *
+ * Returns true only when the range is flagged gpu_mapped and the notifier
+ * interval does not cover it completely, i.e. the range starts before
+ * mmu_range->start or ends after mmu_range->end.
+ */
+static bool
+svm_restore_unmap_needs_quiesce(struct drm_gpusvm_range *range,
+                               const struct mmu_notifier_range *mmu_range)
+{
+       struct amdgpu_svm_range *svm_range = to_amdgpu_svm_range(range);
+       bool head_outside, tail_outside;
+
+       if (!READ_ONCE(svm_range->gpu_mapped))
+               return false;
+
+       head_outside = drm_gpusvm_range_start(range) < mmu_range->start;
+       tail_outside = drm_gpusvm_range_end(range) > mmu_range->end;
+
+       return head_outside || tail_outside;
+}
+
+/*
+ * Handle an MMU notifier invalidation for the ranges of @notifier selected by
+ * @first, @adj_start and @adj_end.
+ *
+ * MMU_NOTIFY_UNMAP: amdgpu_svm_capture_checkpoint_ts() is called, then every
+ * range goes through amdgpu_svm_range_notifier_event_begin(); svm->flush_tlb()
+ * is called once afterwards if any of those calls returned true. An eviction
+ * is started at most once, and only if some range satisfies
+ * svm_restore_unmap_needs_quiesce().
+ *
+ * Any other event: an eviction is started unconditionally and every range is
+ * passed to amdgpu_svm_range_invalidate_gpu_mapping().
+ *
+ * In both cases each range is finally handed to
+ * svm_restore_notifier_event_end(), which queues the deferred work.
+ */
+void
+amdgpu_svm_restore_invalidate(struct amdgpu_svm *svm,
+                             struct drm_gpusvm_notifier *notifier,
+                             const struct mmu_notifier_range *mmu_range,
+                             struct drm_gpusvm_range *first,
+                             uint64_t adj_start, uint64_t adj_end)
+{
+       struct drm_gpusvm_range *cur;
+       bool tlb_dirty = false;
+       bool quiesced = false;
+
+       if (mmu_range->event != MMU_NOTIFY_UNMAP) {
+               svm_restore_eviction_begin(svm);
+
+               cur = first;
+               drm_gpusvm_for_each_range(cur, notifier, adj_start, adj_end)
+                       amdgpu_svm_range_invalidate_gpu_mapping(
+                                               to_amdgpu_svm_range(cur));
+       } else {
+               amdgpu_svm_capture_checkpoint_ts(svm);
+
+               cur = first;
+               drm_gpusvm_for_each_range(cur, notifier, adj_start, adj_end) {
+                       /* Start the eviction on the first range needing it. */
+                       if (!quiesced &&
+                           svm_restore_unmap_needs_quiesce(cur, mmu_range)) {
+                               svm_restore_eviction_begin(svm);
+                               quiesced = true;
+                       }
+
+                       if (amdgpu_svm_range_notifier_event_begin(svm, cur,
+                                                                 mmu_range))
+                               tlb_dirty = true;
+               }
+
+               if (tlb_dirty)
+                       svm->flush_tlb(svm);
+       }
+
+       /* Second pass: unmap pages and queue GC/restore work per range. */
+       cur = first;
+       drm_gpusvm_for_each_range(cur, notifier, adj_start, adj_end)
+               svm_restore_notifier_event_end(svm, cur, mmu_range);
+}
+
+/*
+ * Record pending work for @range and hand it to the GC or restore worker.
+ *
+ * @svm:         SVM instance owning the work lists and workqueues
+ * @range:       range the work applies to
+ * @start_page:  first page of the affected span
+ * @last_page:   last page of the affected span
+ * @pending_ops: AMDGPU_SVM_RANGE_OP_UNMAP or AMDGPU_SVM_RANGE_OP_RESTORE
+ *
+ * Nothing is recorded when svm->exiting is set or when the range already has
+ * an UNMAP pending. Otherwise the pending page span is widened to include
+ * [start_page, last_page] and, depending on range->queue_state, the range is
+ * added to or moved between the work lists under svm->work_lock. A reference
+ * on the range is taken when it is first put on a list. The workers are
+ * kicked only after the lock has been dropped.
+ */
+static void
+svm_restore_enqueue_work(struct amdgpu_svm *svm,
+                        struct amdgpu_svm_range *range,
+                        unsigned long start_page,
+                        unsigned long last_page,
+                        uint8_t pending_ops)
+{
+       bool queue_gc_work = false;
+       bool queue_restore_work = false;
+
+       if (atomic_read(&svm->exiting))
+               return;
+
+       spin_lock(&svm->work_lock);
+
+       /* Deny any work if range is unmapped */
+       if (UNMAP_WORK(range->pending_ops))
+               goto out_unlock;
+
+       range->pending_start_page = min(range->pending_start_page, start_page);
+       range->pending_last_page = max(range->pending_last_page, last_page);
+
+       if (UNMAP_WORK(pending_ops)) {
+               /* UNMAP replaces whatever was pending before. */
+               range->pending_ops = AMDGPU_SVM_RANGE_OP_UNMAP;
+
+               switch (range->queue_state) {
+               case AMDGPU_SVM_RANGE_NOT_QUEUED:
+                       drm_gpusvm_range_get(&range->base);
+                       list_add_tail(&range->work_node, &svm->gc.list);
+                       range->queue_state = AMDGPU_SVM_RANGE_IN_GC;
+                       queue_gc_work = true;
+                       break;
+               case AMDGPU_SVM_RANGE_IN_RESTORE:
+                       /* Not in processing so move to gc is safe. */
+                       list_move_tail(&range->work_node, &svm->gc.list);
+                       range->queue_state = AMDGPU_SVM_RANGE_IN_GC;
+                       queue_gc_work = true;
+                       break;
+               case AMDGPU_SVM_RANGE_PROCESSING:
+                       /*
+                        * Do not move the range into gc: if it is being
+                        * processed by the restore worker, moving it to GC
+                        * may cause a use-after-free, because the GC worker
+                        * would remove the range while the restore worker
+                        * is still using it. The owner worker will see UNMAP
+                        * in pending_ops via put_if_dequeued() and move the
+                        * range to gc after processing the restore work.
+                        */
+                       break;
+               case AMDGPU_SVM_RANGE_IN_GC:
+                       break;
+               }
+       } else {
+               switch (range->queue_state) {
+               case AMDGPU_SVM_RANGE_NOT_QUEUED:
+                       range->pending_ops |= AMDGPU_SVM_RANGE_OP_RESTORE;
+                       drm_gpusvm_range_get(&range->base);
+                       list_add_tail(&range->work_node, &svm->restore.list);
+                       range->queue_state = AMDGPU_SVM_RANGE_IN_RESTORE;
+                       queue_restore_work = true;
+                       break;
+               case AMDGPU_SVM_RANGE_IN_RESTORE:
+                       range->pending_ops |= AMDGPU_SVM_RANGE_OP_RESTORE;
+                       break;
+               case AMDGPU_SVM_RANGE_PROCESSING:
+                       /*
+                        * Owner worker will see RESTORE in pending_ops
+                        * via put_if_dequeued() and queue into restore worker.
+                        */
+                       range->pending_ops |= AMDGPU_SVM_RANGE_OP_RESTORE;
+                       break;
+               case AMDGPU_SVM_RANGE_IN_GC:
+                       /*
+                        * Should not get here because a pending UNMAP
+                        * denies all the work.
+                        */
+                       break;
+               }
+       }
+
+out_unlock:
+       spin_unlock(&svm->work_lock);
+
+       /* Both flags are still false when the request was denied above. */
+       if (queue_gc_work)
+               queue_work(svm->gc.wq, &svm->gc.work);
+       if (queue_restore_work)
+               queue_delayed_work(svm->restore.wq, &svm->restore.work,
+                       msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+}
+
+/*
+ * Set up the restore state of @svm.
+ *
+ * @begin and @end are stored as the restore.begin / restore.end callbacks.
+ * The ordered workqueue is allocated first; the counters, the work list and
+ * the delayed work item (handled by amdgpu_svm_restore_worker()) are only
+ * initialised once that allocation succeeded.
+ *
+ * Returns 0 on success. Returns -ENOMEM if the workqueue cannot be
+ * allocated; restore.wq is then NULL and no other field is initialised.
+ */
+int amdgpu_svm_restore_init(struct amdgpu_svm *svm,
+                           void (*begin)(struct amdgpu_svm *),
+                           void (*end)(struct amdgpu_svm *))
+{
+       struct workqueue_struct *wq;
+
+       wq = alloc_ordered_workqueue(AMDGPU_SVM_RESTORE_WQ_NAME,
+                                    WQ_HIGHPRI | WQ_MEM_RECLAIM);
+       svm->restore.wq = wq;
+       if (!wq)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(&svm->restore.list);
+       INIT_DELAYED_WORK(&svm->restore.work, amdgpu_svm_restore_worker);
+
+       atomic_set(&svm->restore.evicted_ranges, 0);
+       atomic_set(&svm->restore.quiesced, 0);
+
+       svm->restore.begin = begin;
+       svm->restore.end = end;
+
+       return 0;
+}
+
+/*
+ * Tear down the restore state of @svm.
+ *
+ * The steps run in a fixed order: cancel the delayed restore work and wait
+ * for it, clean the restore list via amdgpu_svm_clean_queue(), reset the
+ * eviction counter, call restore.end() if restore.quiesced is still
+ * non-zero, then destroy the workqueue and clear the pointer.
+ */
+void amdgpu_svm_restore_fini(struct amdgpu_svm *svm)
+{
+       cancel_delayed_work_sync(&svm->restore.work);
+       amdgpu_svm_clean_queue(svm, &svm->restore.list);
+       atomic_set(&svm->restore.evicted_ranges, 0);
+
+       if (atomic_read(&svm->restore.quiesced) != 0)
+               svm->restore.end(svm);
+
+       destroy_workqueue(svm->restore.wq);
+       svm->restore.wq = NULL;
+}
+
+/*
+ * Flush the delayed restore work of @svm via flush_delayed_work(); the
+ * return value of that call is not propagated.
+ */
+void amdgpu_svm_restore_flush(struct amdgpu_svm *svm)
+{
+       flush_delayed_work(&svm->restore.work);
+}
-- 
2.34.1

Reply via email to