From: Honglei Huang <[email protected]>

Implement the GPU page table mapping core for SVM ranges:

- PTE flag computation per GC IP version (9.4.x, 11.x, 12.x) with
  coherency mode selection (UC/NC/CC/RW) based on SVM flags
- GPU PTE update helpers using amdgpu_vm_update_range with DMA
  address coalescing across contiguous pagemap entries
- Range mapping loop: find_or_insert via drm_gpusvm, get_pages,
  validate under notifier lock, update GPU PTEs, flush TLB
- Attribute-aware mapping: walk the attr tree to map only accessible
  ranges with correct PTE flags
- Attribute change handler: detect trigger types and remap intervals
  when PTE flags, mapping flags, or access state changes

Signed-off-by: Honglei Huang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_svm_range.c | 539 ++++++++++++++++++
 1 file changed, 539 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_svm_range.c

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_svm_range.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_svm_range.c
new file mode 100644
index 000000000..b3bd4e2e6
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_svm_range.c
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright 2026 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu_svm.h"
+#include "amdgpu_svm_attr.h"
+#include "amdgpu_svm_range.h"
+#include "amdgpu.h"
+#include "amdgpu_amdkfd.h"
+#include "amdgpu_vm.h"
+
+#include <drm/drm_exec.h>
+#include <drm/drm_pagemap.h>
+
+#include <linux/mmu_notifier.h>
+#include <uapi/linux/kfd_ioctl.h>
+
+enum amdgpu_svm_range_queue_op {
+       AMDGPU_SVM_RANGE_OP_RESTORE = 0,
+       AMDGPU_SVM_RANGE_OP_UNMAP = 1,
+};
+
+enum amdgpu_svm_range_pending_op {
+       AMDGPU_SVM_RANGE_PENDING_OP_NONE    = 0,
+       AMDGPU_SVM_RANGE_PENDING_OP_UNMAP   = BIT(0),
+       AMDGPU_SVM_RANGE_PENDING_OP_RESTORE = BIT(1),
+};
+
+#define UNMAP_WORK(ops) ((ops) & AMDGPU_SVM_RANGE_PENDING_OP_UNMAP)
+
+#define RESTORE_WORK(ops) ((ops) & AMDGPU_SVM_RANGE_PENDING_OP_RESTORE)
+
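+/*
+ * Without XNACK the GPU cannot retry faulting accesses, so ranges must be
+ * mapped eagerly instead of on demand at fault time.
+ */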
+#define NEED_REBUILD(svm) (!(svm)->xnack_enabled)
+
+enum amdgpu_svm_range_notifier_op {
+       AMDGPU_SVM_RANGE_NOTIFIER_CLEAR_PTE = BIT(0),
+       AMDGPU_SVM_RANGE_NOTIFIER_QUEUE_INTERVAL = BIT(1),
+};
+
+struct range_pending_op_ctx {
+       struct amdgpu_svm_range *range;
+       unsigned long start;
+       unsigned long last;
+       uint8_t pending_ops;
+};
+
+#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
+#define AMDGPU_SVM_RANGE_WQ_NAME "amdgpu_svm_range"
+#define AMDGPU_SVM_RESTORE_WQ_NAME "amdgpu_svm_restore"
+
+static void
+amdgpu_svm_range_enqueue(struct amdgpu_svm *svm,
+                        struct amdgpu_svm_range *range,
+                        unsigned long start,
+                        unsigned long last,
+                        enum amdgpu_svm_range_queue_op op);
+
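+/* True if the attributes grant the GPU access to the range. */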
+static inline bool
+range_has_access(enum amdgpu_svm_attr_access access)
+{
+       return access == AMDGPU_SVM_ACCESS_ENABLE ||
+              access == AMDGPU_SVM_ACCESS_IN_PLACE;
+}
+
+static void
+range_invalidate_gpu_mapping(struct drm_gpusvm_range *range)
+{
+       WRITE_ONCE(to_amdgpu_svm_range(range)->gpu_mapped, false);
+}
+
+static bool
+range_attr_match(struct drm_gpusvm_range *range,
+                const struct amdgpu_svm_attrs *attrs,
+                uint64_t pte_flags)
+{
+       struct amdgpu_svm_range *r = to_amdgpu_svm_range(range);
+
+       if (!READ_ONCE(r->gpu_mapped))
+               return false;
+
+       return READ_ONCE(r->pte_flags) == pte_flags &&
+              READ_ONCE(r->attr_flags) == attrs->flags;
+}
+
+static bool
+range_pages_valid(struct amdgpu_svm *svm,
+                 struct drm_gpusvm_range *range)
+{
+       lockdep_assert_held(&svm->gpusvm.notifier_lock);
+
+       if (range->pages.flags.unmapped || range->pages.flags.partial_unmap)
+               return false;
+
+       return drm_gpusvm_range_pages_valid(&svm->gpusvm, range);
+}
+
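+/*
+ * Compute GPU PTE flags for a range from its SVM attributes: select the
+ * MTYPE (UC/NC/CC/RW) per GC IP version and coherency flags, then add
+ * execute, read and (unless read-only) write permissions.
+ */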
+static uint64_t
+amdgpu_svm_range_attr_pte_flags(struct amdgpu_svm *svm,
+                           const struct amdgpu_svm_attrs *attrs)
+{
+       /* WA/POC: a simple pte flags func */
+       uint32_t gc_ip_version = amdgpu_ip_version(svm->adev, GC_HWIP, 0);
+       uint32_t flags = attrs->flags;
+       uint32_t mapping_flags = 0;
+       uint64_t pte_flags;
+       bool coherent = flags & (AMDGPU_SVM_FLAG_COHERENT |
+                                AMDGPU_SVM_FLAG_EXT_COHERENT);
+       bool ext_coherent = flags & AMDGPU_SVM_FLAG_EXT_COHERENT;
+       bool snoop = true;
+       unsigned int mtype_local;
+
+       switch (gc_ip_version) {
+       case IP_VERSION(9, 4, 1):
+       case IP_VERSION(9, 4, 2):
+               mapping_flags |= coherent ?
+                       AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
+               break;
+       case IP_VERSION(9, 4, 3):
+       case IP_VERSION(9, 4, 4):
+       case IP_VERSION(9, 5, 0):
+               if (ext_coherent)
+                       mtype_local = AMDGPU_VM_MTYPE_CC;
+               else
+                       mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
+                               amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC :
+                               AMDGPU_VM_MTYPE_RW;
+               if (svm->adev->flags & AMD_IS_APU) {
+                       if (num_possible_nodes() <= 1)
+                               mapping_flags |= mtype_local;
+                       else
+                               mapping_flags |= ext_coherent ?
+                                       AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
+               } else {
+                       if (gc_ip_version < IP_VERSION(9, 5, 0) || ext_coherent)
+                               mapping_flags |= AMDGPU_VM_MTYPE_UC;
+                       else
+                               mapping_flags |= AMDGPU_VM_MTYPE_NC;
+               }
+               break;
+       case IP_VERSION(11, 0, 0):
+       case IP_VERSION(11, 0, 1):
+       case IP_VERSION(11, 0, 2):
+       case IP_VERSION(11, 0, 3):
+       case IP_VERSION(11, 0, 4):
+       case IP_VERSION(11, 5, 0):
+       case IP_VERSION(11, 5, 1):
+       case IP_VERSION(11, 5, 2):
+       case IP_VERSION(11, 5, 3):
+               mapping_flags |= coherent ?
+                       AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
+               break;
+       case IP_VERSION(12, 0, 0):
+       case IP_VERSION(12, 0, 1):
+               mapping_flags |= AMDGPU_VM_MTYPE_NC;
+               break;
+       default:
+               mapping_flags |= coherent ?
+                       AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
+               break;
+       }
+
+       if (flags & AMDGPU_SVM_FLAG_GPU_EXEC)
+               mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
+
+       pte_flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SYSTEM;
+       pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
+       if (gc_ip_version >= IP_VERSION(12, 0, 0))
+               pte_flags |= AMDGPU_PTE_IS_PTE;
+
+       amdgpu_gmc_get_vm_pte(svm->adev, svm->vm, NULL, mapping_flags, &pte_flags);
+       pte_flags |= AMDGPU_PTE_READABLE;
+       if (!(flags & AMDGPU_SVM_FLAG_GPU_RO))
+               pte_flags |= AMDGPU_PTE_WRITEABLE;
+
+       return pte_flags;
+}
+
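+/* Lock the VM page directory with drm_exec before touching page tables. */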
+static int amdgpu_svm_range_lock_vm_pd(struct amdgpu_svm *svm, struct drm_exec *exec)
+{
+       int ret;
+
+       drm_exec_init(exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
+       drm_exec_until_all_locked(exec) {
+               ret = amdgpu_vm_lock_pd(svm->vm, exec, 1);
+               drm_exec_retry_on_contention(exec);
+               if (ret) {
+                       drm_exec_fini(exec);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
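+/*
+ * Update GPU PTEs for [start_page, last_page] with the VM PD locked,
+ * optionally waiting on the update fence and updating the page directories.
+ */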
+static int
+amdgpu_svm_range_update_gpu(struct amdgpu_svm *svm, unsigned long start_page,
+                          unsigned long last_page, uint64_t pte_flags,
+                          dma_addr_t *pages_addr, bool flush_tlb,
+                          bool update_pdes, bool wait_fence)
+{
+       struct drm_exec exec;
+       struct dma_fence *fence = NULL;
+       int ret;
+
+       ret = amdgpu_svm_range_lock_vm_pd(svm, &exec);
+       if (ret)
+               return ret;
+
+       ret = amdgpu_vm_update_range(svm->adev, svm->vm, false, false,
+                                    flush_tlb, true,
+                                    NULL, start_page, last_page, pte_flags, 0, 0,
+                                    NULL, pages_addr, wait_fence ? &fence : NULL);
+       if (!ret && wait_fence && fence) {
+               ret = dma_fence_wait(fence, false);
+               if (ret < 0)
+                       AMDGPU_SVM_TRACE("wait unmap fence failed: ret=%d [0x%lx-0x%lx]-0x%lx\n",
+                                        ret, start_page, last_page,
+                                        last_page - start_page + 1);
+       }
+       if (!ret && update_pdes)
+               ret = amdgpu_vm_update_pdes(svm->adev, svm->vm, false);
+
+       dma_fence_put(fence);
+       drm_exec_fini(&exec);
+       return ret;
+}
+
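+/*
+ * Write PTEs for a single drm_gpusvm range. Contiguous pagemap entries are
+ * coalesced into one amdgpu_vm_update_range() call; the TLB flush and the
+ * returned fence are only requested for the last segment.
+ */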
+static int
+amdgpu_svm_range_update_gpu_range(struct amdgpu_svm *svm,
+                                 struct drm_gpusvm_range *range,
+                                 uint64_t pte_flags,
+                                 bool flush_tlb,
+                                 bool wait_fence,
+                                 struct dma_fence **fence)
+{
+       const unsigned long range_start_page = drm_gpusvm_range_start(range) >> PAGE_SHIFT;
+       const unsigned long range_end_page = drm_gpusvm_range_end(range) >> PAGE_SHIFT;
+       const unsigned long npages = range_end_page - range_start_page;
+       unsigned long mapped_pages = 0;
+       unsigned long dma_idx = 0;
+       int ret;
+
+       lockdep_assert_held(&svm->gpusvm.notifier_lock);
+
+       if (!range->pages.dma_addr || !npages)
+               return -EINVAL;
+
+       while (mapped_pages < npages) {
+               const struct drm_pagemap_addr *entry = &range->pages.dma_addr[dma_idx++];
+               unsigned long seg_pages = min_t(unsigned long, 1UL << entry->order,
+                                               npages - mapped_pages);
+               dma_addr_t seg_addr = entry->addr;
+               unsigned long start_page, last_page;
+               bool is_last_seg;
+
+               if (entry->proto != DRM_INTERCONNECT_SYSTEM)
+                       return -EOPNOTSUPP;
+
+               while (mapped_pages + seg_pages < npages) {
+                       const struct drm_pagemap_addr *next = &range->pages.dma_addr[dma_idx];
+                       unsigned long next_pages = min_t(unsigned long,
+                                                        1UL << next->order,
+                                                        npages - (mapped_pages + seg_pages));
+
+                       if (next->proto != entry->proto ||
+                           next->addr != seg_addr + ((dma_addr_t)seg_pages << PAGE_SHIFT))
+                               break;
+
+                       seg_pages += next_pages;
+                       dma_idx++;
+               }
+
+               start_page = range_start_page + mapped_pages;
+               last_page = start_page + seg_pages - 1;
+               is_last_seg = mapped_pages + seg_pages == npages;
+
+               ret = amdgpu_vm_update_range(svm->adev, svm->vm, false, false,
+                                            flush_tlb && is_last_seg, true, NULL,
+                                            start_page, last_page, pte_flags,
+                                            0, seg_addr, NULL, NULL,
+                                            wait_fence && is_last_seg ? fence : NULL);
+               if (ret)
+                       return ret;
+
+               mapped_pages += seg_pages;
+       }
+
+       return 0;
+}
+
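+/*
+ * Map [start, end) to the GPU: find or insert drm_gpusvm ranges, get their
+ * pages, re-validate under the notifier lock, write the PTEs, then update
+ * the PDEs and flush the TLB. Returns -EAGAIN if a range was invalidated
+ * concurrently by the mmu notifier.
+ */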
+static int
+amdgpu_svm_range_map(struct amdgpu_svm *svm,
+                      unsigned long start,
+                      unsigned long end,
+                      const struct amdgpu_svm_attrs *attrs,
+                      const struct drm_gpusvm_ctx *gpusvm_ctx,
+                      uint64_t pte_flags)
+{
+       unsigned long addr = start;
+       int ret;
+
+       while (addr < end) {
+               struct drm_exec exec;
+               struct drm_gpusvm_ctx map_ctx;
+               struct drm_gpusvm_range *range;
+               struct dma_fence *fence = NULL;
+               unsigned long vma_start;
+               unsigned long next_addr;
+               uint64_t range_pte_flags;
+               unsigned int flags;
+               bool skip_map;
+
+               vma_start = drm_gpusvm_find_vma_start(&svm->gpusvm, addr, end);
+               if (vma_start > addr)
+                       return -EFAULT;
+
+               map_ctx = *gpusvm_ctx;
+retry:
+               range = drm_gpusvm_range_find_or_insert(&svm->gpusvm, addr,
+                                                       vma_start, end,
+                                                       &map_ctx);
+               if (IS_ERR(range)) {
+                       ret = PTR_ERR(range);
+                       /*
+                        * drm_gpusvm denies a writeable mapping when the VMA
+                        * is read-only, but some UMD tests do not set the RO
+                        * flag for read-only VMAs, so retry with read_only set
+                        * when ret is -EPERM.
+                        */
+                       if (ret == -EPERM && !map_ctx.read_only) {
+                               map_ctx.read_only = true;
+                               goto retry;
+                       }
+                       return ret;
+               }
+
+               next_addr = drm_gpusvm_range_end(range);
+               if (next_addr <= addr)
+                       return -EINVAL;
+
+               range_pte_flags = map_ctx.read_only ?
+                       (pte_flags & ~AMDGPU_PTE_WRITEABLE) : pte_flags;
+
+               skip_map = range_attr_match(range, attrs, range_pte_flags);
+
+               AMDGPU_SVM_TRACE("range_map: [0x%lx-0x%lx] skip=%d pte=0x%llx\n",
+                                addr, next_addr, skip_map ? 1 : 0, range_pte_flags);
+
+               if (!skip_map) {
+                       ret = drm_gpusvm_range_get_pages(&svm->gpusvm, range, &map_ctx);
+                       if (ret)
+                               return ret;
+               }
+
+               ret = amdgpu_svm_range_lock_vm_pd(svm, &exec);
+               if (ret)
+                       return ret;
+
+               flags = memalloc_noreclaim_save();
+               drm_gpusvm_notifier_lock(&svm->gpusvm);
+               if (skip_map) {
+                       /* the skip path must still re-validate under the notifier lock */
+                       if (!range_attr_match(range, attrs, range_pte_flags) ||
+                           !range_pages_valid(svm, range)) {
+                               range_invalidate_gpu_mapping(range);
+                               ret = -EAGAIN;
+                       } else {
+                               ret = 0;
+                       }
+               } else if (!range_pages_valid(svm, range)) {
+                       /* not protected by mmap lock, maybe changed by mmu notifier */
+                       ret = -EAGAIN;
+               } else {
+                       ret = amdgpu_svm_range_update_gpu_range(svm, range,
+                                                               range_pte_flags,
+                                                               true, true, &fence);
+               }
+               drm_gpusvm_notifier_unlock(&svm->gpusvm);
+               memalloc_noreclaim_restore(flags);
+
+               if (!ret && fence)
+                       dma_fence_wait(fence, false);
+
+               dma_fence_put(fence);
+
+               if (!ret)
+                       ret = amdgpu_vm_update_pdes(svm->adev, svm->vm, false);
+               if (!ret) {
+                       svm->flush_tlb(svm);
+                       WRITE_ONCE(to_amdgpu_svm_range(range)->pte_flags, range_pte_flags);
+                       WRITE_ONCE(to_amdgpu_svm_range(range)->attr_flags, attrs->flags);
+                       WRITE_ONCE(to_amdgpu_svm_range(range)->gpu_mapped, true);
+               }
+               drm_exec_fini(&exec);
+
+               if (ret)
+                       return ret;
+
+               addr = next_addr;
+       }
+
+       return 0;
+}
+
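+/* Map a page interval with PTE flags derived from its attributes. */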
+static int
+amdgpu_svm_range_map_interval(struct amdgpu_svm *svm, unsigned long start_page,
+                               unsigned long last_page,
+                               const struct amdgpu_svm_attrs *attrs)
+{
+       struct drm_gpusvm_ctx gpusvm_ctx = {
+               .read_only = !!(attrs->flags & AMDGPU_SVM_FLAG_GPU_RO),
+       };
+       unsigned long start = start_page << PAGE_SHIFT;
+       unsigned long end = (last_page + 1) << PAGE_SHIFT;
+       uint64_t pte_flags;
+       int ret;
+
+       pte_flags = amdgpu_svm_range_attr_pte_flags(svm, attrs);
+
+       ret = amdgpu_svm_range_map(svm, start, end, attrs, &gpusvm_ctx,
+                                  pte_flags);
+       if (ret)
+               AMDGPU_SVM_TRACE("map_interval failed: ret=%d [0x%lx-0x%lx)-0x%lx\n",
+                                ret, start, end, end - start);
+
+       return ret;
+}
+
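+/**
+ * amdgpu_svm_range_map_attr_ranges - map an interval according to attributes
+ * @svm: SVM context
+ * @start_page: first page of the interval
+ * @last_page: last page of the interval (inclusive)
+ *
+ * Walks the attribute tree and maps every accessible sub-interval with the
+ * PTE flags derived from its attributes; inaccessible sub-intervals are
+ * skipped. Must be called with svm_lock held for write.
+ *
+ * Return: 0 on success or a negative error code.
+ */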
+int
+amdgpu_svm_range_map_attr_ranges(struct amdgpu_svm *svm,
+                                unsigned long start_page,
+                                unsigned long last_page)
+{
+       struct amdgpu_svm_attr_tree *attr_tree = svm->attr_tree;
+       unsigned long cursor = start_page;
+
+       lockdep_assert_held_write(&svm->svm_lock);
+
+       while (cursor <= last_page) {
+               struct amdgpu_svm_attrs attrs;
+               unsigned long seg_last;
+               unsigned long next;
+               int ret;
+
+               mutex_lock(&attr_tree->lock);
+               amdgpu_svm_attr_lookup_page_locked(attr_tree, cursor, &attrs,
+                                                  &seg_last);
+               mutex_unlock(&attr_tree->lock);
+
+               seg_last = min(seg_last, last_page);
+               if (range_has_access(attrs.access)) {
+                       /* mapping may fail here if there is no VMA or access is denied */
+                       ret = amdgpu_svm_range_map_interval(svm, cursor, seg_last,
+                                                           &attrs);
+                       if (ret)
+                               return ret;
+               }
+
+               if (seg_last == ULONG_MAX || seg_last == last_page)
+                       break;
+
+               next = seg_last + 1;
+               if (next <= cursor)
+                       break;
+               cursor = next;
+       }
+
+       return 0;
+}
+
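+/**
+ * amdgpu_svm_range_apply_attr_change - react to an attribute change
+ * @svm: SVM context
+ * @start: first page of the changed interval
+ * @last: last page of the changed interval (inclusive)
+ * @trigger: AMDGPU_SVM_ATTR_TRIGGER_* bits describing what changed
+ * @prev_attrs: attributes before the change
+ * @new_attrs: attributes after the change
+ *
+ * Remaps the interval when PTE flags, mapping flags or access state change
+ * and the range needs to stay mapped; access loss and location changes are
+ * currently left as TODOs. Must be called with svm_lock held for write.
+ *
+ * Return: 0 on success or a negative error code.
+ */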
+int amdgpu_svm_range_apply_attr_change(struct amdgpu_svm *svm,
+                                      unsigned long start,
+                                      unsigned long last,
+                                      uint32_t trigger,
+                                      const struct amdgpu_svm_attrs *prev_attrs,
+                                      const struct amdgpu_svm_attrs *new_attrs)
+{
+       bool old_access, new_access;
+       bool update_mapping = false;
+
+       lockdep_assert_held_write(&svm->svm_lock);
+
+       old_access = range_has_access(prev_attrs->access);
+       new_access = range_has_access(new_attrs->access);
+
+       AMDGPU_SVM_TRACE("attr change trigger=0x%x old_access=%d new_access=%d [0x%lx-0x%lx]-0x%lx, xnack=%d\n",
+                        trigger, old_access, new_access, start, last, last - start + 1,
+                        svm->xnack_enabled ? 1 : 0);
+
+       if (trigger & AMDGPU_SVM_ATTR_TRIGGER_ACCESS_CHANGE) {
+               if (!new_access && old_access) {
+                       /*
+                        * Do nothing here, to align with KFD SVM behavior.
+                        * TODO: unmap ranges from GPU that lost access
+                        */
+                       AMDGPU_SVM_TRACE("skip unmap ioctl operation [0x%lx-0x%lx]-0x%lx\n",
+                                        start, last, last - start + 1);
+               } else if (new_access) {
+                       if (NEED_REBUILD(svm) ||
+                           (new_attrs->flags & AMDGPU_SVM_FLAG_GPU_ALWAYS_MAPPED))
+                               update_mapping = true;
+               }
+       }
+
+       if ((trigger & (AMDGPU_SVM_ATTR_TRIGGER_PTE_FLAG_CHANGE |
+                       AMDGPU_SVM_ATTR_TRIGGER_MAPPING_FLAG_CHANGE)) &&
+           new_access)
+               update_mapping = true;
+
+       if (trigger & AMDGPU_SVM_ATTR_TRIGGER_LOCATION_CHANGE) {
+               /* TODO: add migration */
+       }
+
+       if (!update_mapping)
+               return 0;
+
+       AMDGPU_SVM_TRACE("mapping update: remap interval [0x%lx-0x%lx]-0x%lx\n",
+                        start, last, last - start + 1);
+       return amdgpu_svm_range_map_interval(svm, start, last, new_attrs);
+}
-- 
2.34.1
