amdgpu: implement xnack-off GC work function

Honglei Huang Tue, 19 May 2026 02:24:07 -0700

From: Honglei Huang <[email protected]>

Implement the garbage collection work function for xnack-off SVM ranges.
When a range is fully unmapped, the GC worker removes it. When a range
is partially unmapped, it removes the stale range and synchronously
rebuilds the surviving sub-region via svm_restore_map_interval().  The
partial-unmap path evicts VRAM-backed pages back to sysmem via
amdgpu_svm_range_evict() (the devmem-aware wrapper) before remove, so
live data is preserved.


In svm_restore_map_attr_segment(), cache the clipped vma->vm_start /
vma->vm_end into local variables before dropping mmap_read_lock, so that
the mapping call, the next find_vma_intersection() lookup and the loop
condition all use the cached values rather than dereferencing a
potentially stale vma pointer.

In svm_restore_map_interval(), the per-iteration cursor advance is
simplified to "if (seg_last >= last_page) break; cursor = seg_last + 1;"
-- attr_get_bounds() returns the segment containing cursor, so
seg_start <= cursor and the explicit ULONG_MAX guard is redundant.

Permanent (non-retryable) errors from amdgpu_svm_range_map_attrs() are
classified and skipped with a trace log rather than retried: -ENOENT,
-EFAULT and -EPERM come from hmm_range_fault() and reflect VMA state
that the worker cannot fix (no VMA, non-faultable mapping, RO VMA
written), -EINVAL reflects a sanity-check failure that will not change
on retry, and -EHWPOISON reflects physical page corruption.  All other
non-zero returns (e.g. -EBUSY, -EAGAIN, -ETIME, -ENOMEM) are treated
as transient and saved for the worker to retry.

The helper is named svm_restore_enqueue_unmapped() (rather than
"requeue") to reflect that it inserts ranges into the restore queue
from both the partial-unmap fallback path here and the attr-change
realignment path added later.

Signed-off-by: Honglei Huang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c | 197 ++++++++++++++++++++
 1 file changed, 197 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c
index 89e8b687b..0b02008be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userptr.c
@@ -369,6 +369,203 @@ static void amdgpu_svm_restore_worker(struct work_struct 
*w)
 
        queue_delayed_work(svm->restore.wq, &svm->restore.work, resched_delay);
 }
+
+/*
+ * Map [start, end) (byte addresses) with @attrs, one VMA-clipped chunk at
+ * a time.
+ *
+ * The span may contain holes, i.e. addresses with no VMA behind them.
+ * Every pass asks find_vma_intersection() for the next VMA overlapping
+ * [addr, end), so holes are stepped over instead of being handed to
+ * amdgpu_svm_range_map_attrs().  The overlap is clipped to the span and
+ * stored in locals while mmap_read_lock is held; nothing reads the vma
+ * after the lock is dropped.
+ *
+ * Result of amdgpu_svm_range_map_attrs() per chunk:
+ *   0                      - nothing to record.
+ *   -ENOENT, -EFAULT, -EPERM, -EINVAL, -EHWPOISON
+ *                          - treated as permanent: traced and skipped, so
+ *                            one bad page/VMA does not stop the rest of
+ *                            the span from being rebuilt.
+ *   any other value        - logged and remembered for the caller.
+ *
+ * In all three cases the walk moves on to the end of the current chunk.
+ *
+ * Returns 0, or the most recent non-permanent error seen.
+ */
+static int
+svm_restore_map_attr_segment(struct amdgpu_svm *svm,
+                            struct mm_struct *mm,
+                            const struct amdgpu_svm_attrs *attrs,
+                            unsigned long start,
+                            unsigned long end)
+{
+       unsigned long addr;
+       int err = 0;
+
+       amdgpu_svm_assert_locked(svm);
+
+       for (addr = start; addr < end; ) {
+               struct vm_area_struct *vma;
+               unsigned long lo, hi;
+               bool found;
+               int rc;
+
+               /* Snapshot the clipped VMA bounds under the mmap lock. */
+               mmap_read_lock(mm);
+               vma = find_vma_intersection(mm, addr, end);
+               found = vma != NULL;
+               if (found) {
+                       lo = max(vma->vm_start, addr);
+                       hi = min(vma->vm_end, end);
+               }
+               mmap_read_unlock(mm);
+
+               /* No VMA left in [addr, end): the rest is one big hole. */
+               if (!found)
+                       break;
+
+               rc = amdgpu_svm_range_map_attrs(svm, attrs, lo, hi);
+               switch (rc) {
+               case 0:
+                       break;
+               case -ENOENT:
+               case -EFAULT:
+               case -EPERM:
+               case -EINVAL:
+               case -EHWPOISON:
+                       /* Permanent: retrying would not help, skip it. */
+                       AMDGPU_SVM_TRACE(
+                               "%s skip permanent [0x%lx-0x%lx) ret=%d\n",
+                               __func__, lo, hi, rc);
+                       break;
+               default:
+                       /* Anything else is handed back to the caller. */
+                       AMDGPU_SVM_ERR("%s failed [0x%lx-0x%lx) ret=%d\n",
+                                      __func__, lo, hi, rc);
+                       err = rc;
+                       break;
+               }
+
+               addr = hi;
+       }
+
+       return err;
+}
+
+/*
+ * Re-map the page interval [start_page, last_page] (inclusive page
+ * indices), one attribute segment at a time.
+ *
+ * For each cursor position the attribute tree is queried, under its
+ * mutex, for the segment bounds around the cursor; the attributes are
+ * copied out while the mutex is held.  The segment is clamped to
+ * last_page and, if an attribute range exists and its access setting
+ * passes amdgpu_svm_attr_has_access(), handed to
+ * svm_restore_map_attr_segment() as a byte range.
+ *
+ * A reference on the mm is held for the duration of the walk.
+ *
+ * Returns -ESRCH if the mm reference cannot be taken, otherwise 0 or the
+ * most recent error reported by svm_restore_map_attr_segment().
+ */
+static int
+svm_restore_map_interval(struct amdgpu_svm *svm,
+                        unsigned long start_page,
+                        unsigned long last_page)
+{
+       struct amdgpu_svm_attr_tree *tree = svm->attr_tree;
+       struct mm_struct *mm = svm->gpusvm.mm;
+       unsigned long page = start_page;
+       int err = 0;
+
+       amdgpu_svm_assert_locked(svm);
+
+       if (!mmget_not_zero(mm))
+               return -ESRCH;
+
+       while (page <= last_page) {
+               struct amdgpu_svm_attrs attrs;
+               unsigned long first, last;
+               bool have_attrs;
+
+               /* Copy the attributes out; the node is not used unlocked. */
+               mutex_lock(&tree->lock);
+               {
+                       struct amdgpu_svm_attr_range *node;
+
+                       node = amdgpu_svm_attr_get_bounds_locked(tree, page,
+                                                                &first,
+                                                                &last);
+                       have_attrs = node != NULL;
+                       if (have_attrs)
+                               attrs = node->attrs;
+               }
+               mutex_unlock(&tree->lock);
+
+               /* Never run past the interval being rebuilt. */
+               last = min(last, last_page);
+
+               if (have_attrs && amdgpu_svm_attr_has_access(attrs.access)) {
+                       int rc;
+
+                       rc = svm_restore_map_attr_segment(
+                                       svm, mm, &attrs,
+                                       page << PAGE_SHIFT,
+                                       (last + 1) << PAGE_SHIFT);
+                       if (rc)
+                               err = rc;
+               }
+
+               /* Stop once the segment reaches the end of the interval. */
+               if (last >= last_page)
+                       break;
+               page = last + 1;
+       }
+
+       mmput(mm);
+
+       return err;
+}
+
+/*
+ * Queue a RESTORE operation for every range overlapping the page interval
+ * [start_page, last_page] whose gpu_mapped flag is clear.
+ *
+ * Each range is queued with its own full extent (first to last page of
+ * the range), not with the interval passed in.
+ */
+static void
+svm_restore_enqueue_unmapped(struct amdgpu_svm *svm,
+                            unsigned long start_page,
+                            unsigned long last_page)
+{
+       const unsigned long addr_lo = start_page << PAGE_SHIFT;
+       const unsigned long addr_hi = (last_page + 1) << PAGE_SHIFT;
+       struct drm_gpusvm_notifier *notifier;
+       struct drm_gpusvm_range *range;
+
+       amdgpu_svm_assert_locked(svm);
+
+       drm_gpusvm_for_each_notifier(notifier, &svm->gpusvm,
+                                    addr_lo, addr_hi) {
+               /* Reset the cursor before walking this notifier's ranges. */
+               range = NULL;
+               drm_gpusvm_for_each_range(range, notifier,
+                                         addr_lo, addr_hi) {
+                       struct amdgpu_svm_range *svm_range;
+
+                       svm_range = to_amdgpu_svm_range(range);
+                       if (!READ_ONCE(svm_range->gpu_mapped)) {
+                               unsigned long range_first, range_last;
+
+                               range_first = drm_gpusvm_range_start(range) >>
+                                             PAGE_SHIFT;
+                               range_last = (drm_gpusvm_range_end(range) >>
+                                             PAGE_SHIFT) - 1;
+
+                               svm_restore_enqueue_work(
+                                       svm, svm_range,
+                                       range_first, range_last,
+                                       AMDGPU_SVM_RANGE_OP_RESTORE);
+                       }
+               }
+       }
+}
+
+/*
+ * GC worker: drain svm->gc.list and remove every dequeued range.
+ *
+ * Entries are dequeued under svm->work_lock; the per-range work runs with
+ * that spinlock released and svm->svm_lock held for write.
+ *
+ * Per range:
+ *   - on a partial unmap, amdgpu_svm_range_evict() runs before the range
+ *     is removed;
+ *   - the range is removed and its reference dropped;
+ *   - on a partial unmap, the range's former page interval is rebuilt
+ *     with svm_restore_map_interval(); if that reports an error, the
+ *     ranges in the interval that are still not GPU-mapped are queued via
+ *     svm_restore_enqueue_unmapped().
+ *
+ * After the list is drained, the restore work is scheduled if
+ * svm->restore.evicted_ranges is non-zero.
+ */
+void amdgpu_svm_restore_gc_work_func(struct work_struct *w)
+{
+       struct amdgpu_svm_gc *gc = container_of(w, struct amdgpu_svm_gc, work);
+       struct amdgpu_svm *svm = container_of(gc, struct amdgpu_svm, gc);
+       struct drm_gpusvm_ctx ctx = { .in_notifier = false };
+       struct amdgpu_svm_range_op_ctx op_ctx;
+
+       for (;;) {
+               struct drm_gpusvm_range *base;
+               unsigned long first_page, last_page;
+               bool partial, more;
+
+               spin_lock(&svm->work_lock);
+               more = amdgpu_svm_range_dequeue_locked(svm, &svm->gc.list,
+                                                      &op_ctx);
+               spin_unlock(&svm->work_lock);
+               if (!more)
+                       break;
+
+               /* Record the extent now; the range is gone after removal. */
+               base = &op_ctx.range->base;
+               first_page = drm_gpusvm_range_start(base) >> PAGE_SHIFT;
+               last_page = (drm_gpusvm_range_end(base) >> PAGE_SHIFT) - 1;
+               partial = base->pages.flags.partial_unmap;
+
+               WARN_ON(!UNMAP_WORK(op_ctx.pending_ops));
+
+               down_write(&svm->svm_lock);
+
+               if (partial)
+                       amdgpu_svm_range_evict(svm, base);
+
+               amdgpu_svm_range_remove(svm, op_ctx.range, &ctx);
+
+               /*
+                * Drop the range right away so that the stale range cannot
+                * block the rebuild below.
+                */
+               drm_gpusvm_range_put(base);
+               op_ctx.range = NULL;
+
+               /*
+                * GC always removes the entire range, so after a partial
+                * unmap the still-valid area has to be rebuilt.
+                */
+               if (partial &&
+                   svm_restore_map_interval(svm, first_page, last_page))
+                       svm_restore_enqueue_unmapped(svm, first_page,
+                                                    last_page);
+
+               up_write(&svm->svm_lock);
+       }
+
+       if (!atomic_read(&svm->restore.evicted_ranges))
+               return;
+
+       queue_delayed_work(svm->restore.wq, &svm->restore.work,
+                          msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+}
+
 int amdgpu_svm_restore_init(struct amdgpu_svm *svm,
                            void (*begin)(struct amdgpu_svm *),
                            void (*end)(struct amdgpu_svm *))
-- 
2.34.1

[RFC 5/8] drm/amdgpu: implement xnack-off GC work function

Reply via email to