[PATCH] drm/amdgpu: Show retry fault message if process xnack on
If vm_context_cntl set xnack on, then GPU vm fault has retry_fault bit set, but the driver select xnack on or off path depending on per process xnack setting which is also used to set qpd mem_config xnack on or off if KFD_SUPPORT_XNACK_PER_PROCESS. If process is xnack on, then GPU page fault show retry page fault message, otherwise show no-retry page fault message, to avoid misleading when debugging application page fault issue. The process lookup from pasid is done inside retry fault handler svm_range_restore_pages, add xnack_on parameter to pass process xnack setting back to amdgpu_vm_handle_fault and then to gmc interrupt handler to show vm fault message. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +- 6 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 64ddc87f7fb6..58f7ab193027 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2757,13 +2757,14 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) * GFX 9.4.3. * @addr: Address of the fault * @write_fault: true is write fault, false is read fault + * @xnack_on: return value, true if the process sets xnack on * * Try to gracefully handle a VM fault. Return true if the fault was handled and * shouldn't be reported any more. 
*/ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, - bool write_fault) + bool write_fault, bool *xnack_on) { bool is_compute_context = false; struct amdgpu_bo *root; @@ -2788,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, addr /= AMDGPU_GPU_PAGE_SIZE; if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, - node_id, addr, write_fault)) { + node_id, addr, write_fault, xnack_on)) { amdgpu_bo_unref(); return true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index bc71b44387b2..7f364f0b9a60 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -549,7 +549,7 @@ void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, - bool write_fault); + bool write_fault, bool *xnack_on); void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index d933e19e0cf5..2f0752376236 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -132,7 +132,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, /* Try to handle the recoverable page faults by filling page * tables */ - if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault)) + if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault, NULL)) return 1; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 671a6766df5b..3db0f2304b6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -558,6 +558,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, uint32_t cam_index = 0; int ret, xcc_id = 0; uint32_t node_id; + bool xnack_on = false; node_id = 
entry->node_id; @@ -595,7 +596,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, cam_index = entry->src_data[2] & 0x3ff; ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, -addr, write_fault); +addr, write_fault, _on); WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index); if (ret) return 1; @@ -618,7 +619,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, * tables */ if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, -
Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On 2024-05-02 08:42, James Zhu wrote: On 2024-05-01 18:56, Philip Yang wrote: On system with khugepaged enabled and user cases with THP buffer, the hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary timeout value is not accurate, cause memory allocation failure. Remove the arbitrary timeout value, return EAGAIN to application if hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call ioctl again. Change EAGAIN to debug message as this is not error. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 + 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54198c3928c7..02696c2102f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, ); if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + if (ret == -EAGAIN) + pr_debug("Failed to get user pages, try again\n"); + else + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 431ec72655ec..e36fede7f74c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 64MB takes maximum 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); - timeout *= HMM_RANGE_DEFAULT_TIMEOUT; - timeout = jiffies + 
msecs_to_jiffies(timeout); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); [JZ] should we reduce MAX_WALK_BYTE to 64M in the meantime? From debug log, the range size is not related, 64MB range may takes same long time to return EBUSY too. retry: hmm_range->notifier_seq = mmu_interval_read_begin(notifier); r = hmm_range_fault(hmm_range); if (unlikely(r)) { - schedule(); [JZ] the above is for CPU stall WA, we may still need keep it. The timeout 1 second should be long enough for normal case, if hmm_range_fault returns EBUSY, we release mmap_read lock and return to user space, so don't need explicit schedule to fix the CPU stale warning. Will run overnight KFDTest LargestSysBufferTest on larger memory system to confirm if there is CPU stale message. Regards, Philip - /* - * FIXME: This timeout should encompass the retry from - * mmu_interval_read_retry() as well. - */ if (r == -EBUSY && !time_after(jiffies, timeout)) goto retry; goto out_free_pfns; @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, out_free_range: kfree(hmm_range); + if (r == -EBUSY) + r = -EAGAIN; return r; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On 2024-05-02 00:09, Chen, Xiaogang wrote: On 5/1/2024 5:56 PM, Philip Yang wrote: Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. On system with khugepaged enabled and user cases with THP buffer, the hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary timeout value is not accurate, cause memory allocation failure. Remove the arbitrary timeout value, return EAGAIN to application if hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call ioctl again. Wonder why letting user space do retry is better? Seems this issue is caused by hugepage merging, so how user space can avoid it? The issue is caused by khugepaged + 4 processes + sdma stalls test (to slow down sdma) + small_BAR + QPX mode, during overnight test, hmm_range_fault 180MB buffer may takes >15 seconds returns EBUSY, then alloc memory ioctl failed. Return EAGAIN, Thunk will call the alloc memory ioctl again, and we don't see the alloc memory failure. And applications may not use Thunk or libdrm, instead, use ioctl directly. If app calls ioctl directly, it should do the same thing, to call ioctl again if errno is EINTR or EAGAIN. Regards, Philip Regards Xiaogang Change EAGAIN to debug message as this is not error. 
Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 + 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54198c3928c7..02696c2102f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, ); if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + if (ret == -EAGAIN) + pr_debug("Failed to get user pages, try again\n"); + else + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 431ec72655ec..e36fede7f74c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 64MB takes maximum 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); - timeout *= HMM_RANGE_DEFAULT_TIMEOUT; - timeout = jiffies + msecs_to_jiffies(timeout); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: hmm_range->notifier_seq = mmu_interval_read_begin(notifier); r = hmm_range_fault(hmm_range); if (unlikely(r)) { - schedule(); - /* - * FIXME: This timeout should encompass the retry from - * mmu_interval_read_retry() as well. - */
[PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On system with khugepaged enabled and user cases with THP buffer, the hmm_range_fault may takes > 15 seconds to return -EBUSY, the arbitrary timeout value is not accurate, cause memory allocation failure. Remove the arbitrary timeout value, return EAGAIN to application if hmm_range_fault return EBUSY, then userspace libdrm and Thunk will call ioctl again. Change EAGAIN to debug message as this is not error. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 - drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 + 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 54198c3928c7..02696c2102f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, ); if (ret) { - pr_err("%s: Failed to get user pages: %d\n", __func__, ret); + if (ret == -EAGAIN) + pr_debug("Failed to get user pages, try again\n"); + else + pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 431ec72655ec..e36fede7f74c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, pr_debug("hmm range: start = 0x%lx, end = 0x%lx", hmm_range->start, hmm_range->end); - /* Assuming 64MB takes maximum 1 second to fault page address */ - timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL); - timeout *= HMM_RANGE_DEFAULT_TIMEOUT; - timeout = jiffies + msecs_to_jiffies(timeout); + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); retry: 
hmm_range->notifier_seq = mmu_interval_read_begin(notifier); r = hmm_range_fault(hmm_range); if (unlikely(r)) { - schedule(); - /* -* FIXME: This timeout should encompass the retry from -* mmu_interval_read_retry() as well. -*/ if (r == -EBUSY && !time_after(jiffies, timeout)) goto retry; goto out_free_pfns; @@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, out_free_range: kfree(hmm_range); + if (r == -EBUSY) + r = -EAGAIN; return r; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 94f83be2232d..e7040f809f33 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm, readonly, owner, NULL, _range); WRITE_ONCE(p->svms.faulting_task, NULL); - if (r) { + if (r) pr_debug("failed %d to get svm range pages\n", r); - if (r == -EBUSY) - r = -EAGAIN; - } } else { r = -EFAULT; } -- 2.43.2
Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()
On 2024-04-30 19:29, Ramesh Errabolu wrote: Analysis of code by Coverity, a static code analyser, has identified a resource leak in the symbol hmm_range. This leak occurs when one of the prior steps before it is released encounters an error. Signed-off-by: Ramesh Errabolu Reviewed-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 386875e6eb96..dcb1d5d3f860 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, start = map_start << PAGE_SHIFT; end = (map_last + 1) << PAGE_SHIFT; for (addr = start; !r && addr < end; ) { - struct hmm_range *hmm_range; + struct hmm_range *hmm_range = NULL; unsigned long map_start_vma; unsigned long map_last_vma; struct vm_area_struct *vma; @@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_lock(prange); - if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { + + // Free backing memory of hmm_range if it was initialized + if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) { pr_debug("hmm update the range, need validate again\n"); r = -EAGAIN; }
[PATCH v6 1/5] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIGUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index f672205243e0..02d66faaade5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..d09c4a18e571 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
[PATCH v6 5/5] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare the contiguous VRAM allocation flag support. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index d09c4a18e571..f8e9d3c1d117 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH v6 4/5] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. v6: user context should use interruptible call (Felix) Signed-off-by: Philip Yang --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 02d66faaade5..acc825b84113 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,30 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + struct ttm_operation_ctx ctx = { true, false }; + + amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; } -- 2.43.2
[PATCH v6 0/5] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) v4: remove the buddy block size limit from vram mgr because sg table creation already remove the limit, and resource uses u64 to handle block start, size (Christian) v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the macro name. v6: use shorter flag name, use interruptible wait ctx, drop patch 5/6 (Felix) Philip Yang (5): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 23 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 +- include/uapi/linux/kfd_ioctl.h| 4 +++- 4 files changed, 33 insertions(+), 9 deletions(-) -- 2.43.2
[PATCH v6 3/5] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM failed to alloc VRAM, TTM try evict BOs from VRAM to system memory then retry the allocation, this skips the KFD BOs from the same process because KFD require all BOs are resident for user queues. If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 64f5001a7dc5..c21ea808f931 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1403,7 +1403,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v6 2/5] drm/amdgpu: Handle sg size limit for contiguous allocation
Define macro AMDGPU_MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than AMDGPU_MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..ebffb58ea53a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); - + size = remaining_size; if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; @@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); while (cursor.remaining) { num_entries++; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; 
+ unsigned long size = min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE); dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, size); } return 0; -- 2.43.2
Re: [PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation
On 2024-04-23 18:17, Felix Kuehling wrote: On 2024-04-23 11:28, Philip Yang wrote: RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) If I understand it correctly, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS was redefined to mean "best effort". Maybe we can drop the explicit "BEST_EFFORT" from this flag as well to keep the name to a reasonable length. 
yes, AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS is redefined, to implement "best effort" without adding new upstream GEM flag, so we may get scattered allocation if contiguous allocation failed. If we drop the "BEST_EFFORT" from flag name, this may mislead the users. Regards, Philip Regards, Felix /* Allocate memory for later SVM (shared virtual memory) mapping. *
Re: [PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation
On 2024-04-23 18:15, Felix Kuehling wrote: On 2024-04-23 11:28, Philip Yang wrote: If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* + * If bo is not contiguous on VRAM, move to system memory first to ensure + * we can get contiguous VRAM space after evicting other BOs. + */ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); amdgpu_amdkfd_bo_validate is meant for use in kernel threads. It always runs uninterruptible. I believe pin_bo runs in the context of ioctls from user mode. So it should be interruptible. yes, pin_bo is in the context of user mode, from KFD alloc memory or from rdma driver get pages, should use interruptible wait. amdgpu_amdkfd_bo_validate is currently used by kernel threads and ioctl amdgpu_amdkfd_add_gws_to_process (this seems bug), does it make sense to add parameter interruptible, then we can remove many duplicate code amdgpu_bo_placement_from_domain + ttm_bo_validate or I can fix it here and leave the cleanup and bug fix in the future? 
Regards, Philip Regards, Felix + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", >tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; }
[PATCH v5 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare the contiguous VRAM allocation flag support. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index c1394c162d4e..a5ebbe98ff7f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH v5 4/6] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", >tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; } -- 2.43.2
[PATCH v5 2/6] drm/amdgpu: Handle sg size limit for contiguous allocation
Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..ebffb58ea53a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define AMDGPU_MAX_SG_SEGMENT_SIZE (2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); - + size = remaining_size; if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; @@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); while (cursor.remaining) { num_entries++; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, min(cursor.size, AMDGPU_MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, 
AMDGPU_MAX_SG_SEGMENT_SIZE); dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, size); } return 0; -- 2.43.2
[PATCH v5 3/6] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM failed to alloc VRAM, TTM try evict BOs from VRAM to system memory then retry the allocation, this skips the KFD BOs from the same process because KFD require all BOs are resident for user queues. If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 851509c6e90e..c907d6005641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v5 5/6] drm/amdkfd: Increase KFD bo restore wait time
TTM allocate contiguous VRAM may take more than 1 second to evict BOs for larger size RDMA buffer. Because KFD restore bo worker reserves all KFD BOs, then TTM cannot hold the remaining KFD BOs lock to evict them, this causes TTM to fail to alloc contiguous VRAM. Increase the KFD restore BO wait time to 2 seconds, long enough for RDMA pin BO to alloc the contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */ -- 2.43.2
[PATCH v5 1/6] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIGUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
[PATCH v5 0/6] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) v4: remove the buddy block size limit from vram mgr because sg table creation already remove the limit, and resource uses u64 to handle block start, size (Christian) v5: remove patch 7 which is not for upstream, add AMDGPU prefix to the macro name. Philip Yang (6): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Increase KFD bo restore wait time drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 +-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- include/uapi/linux/kfd_ioctl.h| 4 +++- 5 files changed, 31 insertions(+), 10 deletions(-) -- 2.43.2
Re: [PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
On 2024-04-23 09:32, Christian König wrote: Am 23.04.24 um 15:04 schrieb Philip Yang: To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Well just to make it clear this patch is really a no-go for upstreaming. The RDMA code isn't upstream as far as I know and doing this here is really not a good idea even internally. Right, this change is not needed and not related to upstream, just to minimize the difference with upstream. I will not upstream this patch. Regards, Philip Regards, Christian. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 6c7133bf51d8..101a85263b53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -698,12 +698,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -717,10 +720,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -745,10 +748,12 @@ void 
amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); }
[PATCH v4 1/7] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pinning this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIGUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
[PATCH v4 3/7] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM failed to alloc VRAM, TTM try evict BOs from VRAM to system memory then retry the allocation, this skips the KFD BOs from the same process because KFD require all BOs are resident for user queues. If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 851509c6e90e..c907d6005641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v4 7/7] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare the contiguous VRAM allocation flag support. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index c1394c162d4e..a5ebbe98ff7f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH v4 4/7] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", >tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; } -- 2.43.2
[PATCH v4 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 6c7133bf51d8..101a85263b53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -698,12 +698,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -717,10 +720,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -745,10 +748,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); } -- 2.43.2
[PATCH v4 5/7] drm/amdkfd: Increase KFD bo restore wait time
TTM allocate contiguous VRAM may take more than 1 second to evict BOs for larger size RDMA buffer. Because KFD restore bo worker reserves all KFD BOs, then TTM cannot hold the remaining KFD BOs lock to evict them, this causes TTM to fail to alloc contiguous VRAM. Increase the KFD restore BO wait time to 2 seconds, long enough for RDMA pin BO to alloc the contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */ -- 2.43.2
[PATCH v4 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation
Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..6c7133bf51d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define MAX_SG_SEGMENT_SIZE(2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,9 +534,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); - + size = remaining_size; if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) min_block_size = (u64)pages_per_block << PAGE_SHIFT; @@ -675,7 +675,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); while (cursor.remaining) { num_entries++; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, min(cursor.size, MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +695,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, 
MAX_SG_SEGMENT_SIZE); dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +708,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, size); } return 0; -- 2.43.2
[PATCH v4 0/7] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) v4: remove the buddy block size limit from vram mgr because sg table creation already remove the limit, and resource uses u64 to handle block start, size (Christian) Philip Yang (7): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Increase KFD bo restore wait time drm/amdgpu: Skip dma map resource for null RDMA device drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 - drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 45 ++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- include/uapi/linux/kfd_ioctl.h| 4 +- 5 files changed, 50 insertions(+), 24 deletions(-) -- 2.43.2
Re: [PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
On 2024-04-22 10:56, Christian König wrote: Am 22.04.24 um 15:57 schrieb Philip Yang: To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Well that is completely illegal and would break IOMMU. Why does the RDMA driver does that in the first place? That is the amdp2ptest driver, part of KFDTest rdma test. The simple rdma test app and driver is used to test the driver path, without actually transferring data b/w machines. Regards, Philip Regards, Christian. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9fe56a21ef88..0caf2c89ef1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -705,12 +705,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -724,10 +727,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -752,10 +755,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct 
scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); }
Re: [PATCH v3 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation
On 2024-04-22 10:40, Christian König wrote: Am 22.04.24 um 15:57 schrieb Philip Yang: Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..9fe56a21ef88 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define MAX_SG_SEGMENT_SIZE (2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,8 +534,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations + * for no contiguous allocation. + */ + size = min(remaining_size, MAX_SG_SEGMENT_SIZE); Well that doesn't make sense, either fix the creation of the sg tables or limit the segment size. Not both. yes, right. we don't need limit the segment size for non-contiguous allocation either as this is handled by min_block_size. I will send v4 patch to fix this. Then we could have another patch to remove the while loop, size and remaining size to simply the code in future. 
Regards, Philip if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) @@ -675,7 +682,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); while (cursor.remaining) { num_entries++; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, min(cursor.size, MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +702,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); Please keep size_t here or use unsigned int, using unsigned long just looks like trying to hide the problem. And I wouldn't use a separate define but rather just INT_MAX instead. Regards, Christian. dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +715,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, size); } return 0;
[PATCH v3 6/7] drm/amdgpu: Skip dma map resource for null RDMA device
To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource and sg table to avoid null pointer access. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9fe56a21ef88..0caf2c89ef1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -705,12 +705,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -724,10 +727,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -752,10 +755,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); } -- 2.43.2
[PATCH v3 0/7] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag v3: add patch 2 to handle sg segment size limit (Christian) Philip Yang (7): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Handle sg size limit for contiguous allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Increase KFD bo restore wait time drm/amdgpu: Skip dma map resource for null RDMA device drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 50 --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- include/uapi/linux/kfd_ioctl.h| 4 +- 5 files changed, 56 insertions(+), 23 deletions(-) -- 2.43.2
[PATCH v3 5/7] drm/amdkfd: Increase KFD bo restore wait time
TTM allocate contiguous VRAM may take more than 1 second to evict BOs for larger size RDMA buffer. Because KFD restore bo worker reserves all KFD BOs, then TTM cannot hold the remaining KFD BOs lock to evict them, this causes TTM to fail to alloc contiguous VRAM. Increase the KFD restore BO wait time to 2 seconds, long enough for RDMA pin BO to alloc the contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */ -- 2.43.2
[PATCH v3 7/7] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare the contiguous VRAM allocation flag support. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index c1394c162d4e..a5ebbe98ff7f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH v3 3/7] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM failed to alloc VRAM, TTM try evict BOs from VRAM to system memory then retry the allocation, this skips the KFD BOs from the same process because KFD require all BOs are resident for user queues. If TTM with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 851509c6e90e..c907d6005641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v3 2/7] drm/amdgpu: Handle sg size limit for contiguous allocation
Define macro MAX_SG_SEGMENT_SIZE 2GB, because struct scatterlist length is unsigned int, and some users of it cast to a signed int, so every segment of sg table is limited to size 2GB maximum. For contiguous VRAM allocation, don't limit the max buddy block size in order to get contiguous VRAM memory. To workaround the sg table segment size limit, allocate multiple segments if contiguous size is bigger than MAX_SG_SEGMENT_SIZE. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 - 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..9fe56a21ef88 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -31,6 +31,8 @@ #include "amdgpu_atomfirmware.h" #include "atom.h" +#define MAX_SG_SEGMENT_SIZE(2UL << 30) + struct amdgpu_vram_reservation { u64 start; u64 size; @@ -532,8 +534,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations +* for no contiguous allocation. 
+*/ + size = min(remaining_size, MAX_SG_SEGMENT_SIZE); if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) @@ -675,7 +682,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); while (cursor.remaining) { num_entries++; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, min(cursor.size, MAX_SG_SEGMENT_SIZE)); } r = sg_alloc_table(*sgt, num_entries, GFP_KERNEL); @@ -695,7 +702,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, amdgpu_res_first(res, offset, length, ); for_each_sgtable_sg((*sgt), sg, i) { phys_addr_t phys = cursor.start + adev->gmc.aper_base; - size_t size = cursor.size; + unsigned long size = min(cursor.size, MAX_SG_SEGMENT_SIZE); dma_addr_t addr; addr = dma_map_resource(dev, phys, size, dir, @@ -708,7 +715,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg_dma_address(sg) = addr; sg_dma_len(sg) = size; - amdgpu_res_next(, cursor.size); + amdgpu_res_next(, size); } return 0; -- 2.43.2
[PATCH v3 4/7] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..5d118e5580ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,27 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", >tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); +out: amdgpu_bo_unreserve(bo); - return ret; } -- 2.43.2
[PATCH v3 1/7] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 include/uapi/linux/kfd_ioctl.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
Re: [PATCH] drm/amdkfd: Fix rescheduling of restore worker
On 2024-04-19 15:00, Felix Kuehling wrote: Handle the case that the restore worker was already scheduled by another eviction while the restore was in progress. Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs") Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index aafdf064651f..58c1fe542193 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -2012,9 +2012,9 @@ static void restore_process_worker(struct work_struct *work) if (ret) { pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", p->pasid, PROCESS_BACK_OFF_TIME_MS); - ret = queue_delayed_work(kfd_restore_wq, >restore_work, -msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); - WARN(!ret, "reschedule restore work failed\n"); + if (mod_delayed_work(kfd_restore_wq, >restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) + kfd_process_restore_queues(p); } }
Re: [PATCH v2 1/6] drm/amdgpu: Support contiguous VRAM allocation
On 2024-04-18 10:37, Christian König wrote: Am 18.04.24 um 15:57 schrieb Philip Yang: RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Remove the 2GB max memory block size limit for contiguous allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++-- include/uapi/linux/kfd_ioctl.h | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 
0 : fpriv->xcp_id; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..2f2ae711 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -532,8 +532,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations + * for no contiguous allocation. + */ + size = min(remaining_size, 2ULL << 30); Oh, I totally missed this in the first review. That won't work like that the sg table limit is still there even if the BO is contiguous. We could only fix up the VRAM P2P support to use multiple segments in the sg table. yes, you are right, I didn't test with buffer size > 4GB, struct scatterlist->offset, length is unsigned int, this limits each sg_table entry size < 4GB. I will do more testing, we should still get >4GB contiguous VRAM, will add another patch to fix it inside amdgpu_vram_mgr_alloc_sgt, to split it into multiple sg_table entries, and RDMA peerdirect app should be able to handle this case based on sg_table->nents. Regards, Philip Regards, Christian. if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25)
Re: [PATCH] drm/amdkfd: Fix eviction fence handling
On 2024-04-17 23:14, Felix Kuehling wrote: Handle case that dma_fence_get_rcu_safe returns NULL. If restore work is already scheduled, only update its timer. The same work item cannot be queued twice, so undo the extra queue eviction. Fixes: 9a1c1339abf9 ("drm/amdkfd: Run restore_workers on freezable WQs") Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index b79986412cd8..aafdf064651f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1922,6 +1922,8 @@ static int signal_eviction_fence(struct kfd_process *p) rcu_read_lock(); ef = dma_fence_get_rcu_safe(>ef); rcu_read_unlock(); + if (!ef) + return -EINVAL; ret = dma_fence_signal(ef); dma_fence_put(ef); @@ -1949,10 +1951,9 @@ static void evict_process_worker(struct work_struct *work) * they are responsible stopping the queues and scheduling * the restore work. */ - if (!signal_eviction_fence(p)) - queue_delayed_work(kfd_restore_wq, >restore_work, -msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - else + if (signal_eviction_fence(p) || + mod_delayed_work(kfd_restore_wq, >restore_work, + msecs_to_jiffies(PROCESS_RESTORE_TIME_MS))) kfd_process_restore_queues(p); pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
[PATCH v2 3/6] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM space, and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c| 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index ef9154043757..ff7f54741661 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,13 +1470,28 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", >tbo, ret); + goto out; + } + } + } + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false); - amdgpu_bo_unreserve(bo); +out: + amdgpu_bo_unreserve(bo); return ret; } -- 2.43.2
[PATCH v2 1/6] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather ability requires contiguous VRAM buffer allocation for RDMA peer direct support. Add a new KFD alloc memory flag and store as bo alloc flag AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS. When pin this bo to export for RDMA peerdirect access, this will set TTM_PL_FLAG_CONTIFUOUS flag, and ask VRAM buddy allocator to get contiguous VRAM. Remove the 2GB max memory block size limit for contiguous allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++-- include/uapi/linux/kfd_ioctl.h | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..ef9154043757 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1712,6 +1712,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 
0 : fpriv->xcp_id; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 4be8b091099a..2f2ae711 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -532,8 +532,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations +* for no contiguous allocation. +*/ + size = min(remaining_size, 2ULL << 30); if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
[PATCH v2 5/6] drm/amdgpu: Skip dma map resource for null RDMA device
To test RDMA using dummy driver on the system without NIC/RDMA device, the get/put dma pages pass in null device pointer, skip the dma map/unmap resource to avoid null pointer access. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 2f2ae711..4c512a372ec7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -703,12 +703,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, size_t size = cursor.size; dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -722,10 +725,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -750,10 +753,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); } -- 2.43.2
[PATCH v2 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare the contiguous VRAM allocation flag support. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index c1394c162d4e..a0af2ef696ea 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag for RDMA */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH v2 4/6] drm/amdkfd: Increase KFD bo restore wait time
TTM allocate contiguous VRAM may take more than 1 second to evict BOs for larger size RDMA buffer. Because KFD restore bo worker reserves all KFD BOs, then TTM cannot hold the remaining KFD BOs lock to evict them, this causes TTM to fail to alloc contiguous VRAM. Increase the KFD restore BO wait time to 2 seconds, long enough for RDMA pin BO to alloc the contiguous VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */ -- 2.43.2
[PATCH v2 2/6] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM fails to alloc VRAM, TTM tries to evict BOs from VRAM to system memory then retries the allocation; this skips the KFD BOs from the same process because KFD requires all BOs to be resident for user queues. If TTM allocates contiguous VRAM with the TTM_PL_FLAG_CONTIGUOUS flag, allow TTM to evict KFD BOs from the same process; this will evict the user queues first, and restore the queues later after the contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 851509c6e90e..c907d6005641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1398,7 +1398,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(&resv_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH v2 0/6] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. v2: rebase on patch ("drm/amdgpu: Modify the contiguous flags behaviour") to avoid adding the new GEM flag Philip Yang (6): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Increase KFD bo restore wait time drm/amdgpu: Skip dma map resource for null RDMA device drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 21 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 42 --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- include/uapi/linux/kfd_ioctl.h| 4 +- 5 files changed, 52 insertions(+), 20 deletions(-) -- 2.43.2
Re: [PATCH v2] drm/amdgpu: Modify the contiguous flags behaviour
On 2024-04-17 10:32, Paneer Selvam, Arunpravin wrote: Hi Christian, On 4/17/2024 6:57 PM, Paneer Selvam, Arunpravin wrote: Hi Christian, On 4/17/2024 12:19 PM, Christian König wrote: Am 17.04.24 um 08:21 schrieb Arunpravin Paneer Selvam: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. 
v2: - keep the mem_flags and bo->flags check as is(Christian) - place the TTM_PL_FLAG_CONTIGUOUS flag setting into the amdgpu_bo_pin_restricted function placement range iteration loop(Christian) - rename find_pages with amdgpu_vram_mgr_calculate_pages_per_block (Christian) - Keep the kernel BO allocation as is(Christain) - If BO pin vram allocation failed, we need to return -ENOSPC as RDMA cannot work with scattered VRAM pages(Philip) Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 50 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..caaef7b1df49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) + if (abo->tbo.type == ttm_bo_type_kernel && + flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + c++; } @@ -966,6 +968,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (!bo->placements[i].lpfn || (lpfn && lpfn < bo->placements[i].lpfn)) bo->placements[i].lpfn = lpfn; + + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && + bo->placements[i].mem_type == TTM_PL_VRAM) + bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; } r = ttm_bo_validate(>tbo, >placement, ); Nice work, up till here that looks exactly right as far as I can see. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
Re: [PATCH 1/6] drm/amdgpu: Support contiguous VRAM allocation
On 2024-04-15 08:02, Christian König wrote: Am 12.04.24 um 22:12 schrieb Philip Yang: RDMA device with limited scatter-gather capability requires physical address contiguous VRAM buffer for RDMA peer direct access. Add a new KFD alloc memory flag and store as new GEM bo alloc flag. When pin this buffer object to export for RDMA peerdirect access, set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag, and then vram_mgr will set TTM_PL_FLAG_CONTIFUOUS flag to ask VRAM buddy allocator to get contiguous VRAM. Remove the 2GB max memory block size limit for contiguous allocation. I'm going to sync up with Arun on this once more, but I think we won't even need the new flag. We will just downgrade the existing flag to be a best effort allocation for contiguous buffers and only use the TTM flag internally to signal that we need to alter it while pinning. sure, I will rebase this patch series to "[PATCH] drm/amdgpu: Modify the contiguous flags behaviour", this will remove the new flag. Will send v2 patch series after Arun's v2 patch. Regards, Philip Regards, Christian. 
Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++-- include/uapi/drm/amdgpu_drm.h | 5 + include/uapi/linux/kfd_ioctl.h | 1 + 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..3523b91f8add 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,6 +1470,9 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT) + bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); @@ -1712,6 +1715,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 0 : fpriv->xcp_id; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..1d6e45e238e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -516,8 +516,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations + * for no contiguous allocation. 
+ */ + size = min(remaining_size, 2ULL << 30); if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIF
Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
On 2024-04-16 02:50, Paneer Selvam, Arunpravin wrote: On 4/16/2024 3:32 AM, Philip Yang wrote: On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. This change will simplify the KFD best effort contiguous VRAM allocation, because KFD doesn't need set new GEM_ flag. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs and F/W size related BOs, do all allocation happen at driver initialization before the VRAM is fragmented? - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. 
Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 14 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..41926d631563 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; c++; } @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_operation_ctx ctx = { false, false }; + struct ttm_place *places = bo->placements; + u32 c = 0; int r, i; if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (bo->tbo.pin_count) { uint32_t mem_type = bo->tbo.resource->mem_type; - uint32_t mem_flags = bo->tbo.resource->placement; if (!(domain & amdgpu_mem_type_to_domain(mem_type))) return -EINVAL; - if ((mem_type == TTM_PL_VRAM) && - (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) && - !(mem_flags & TTM_PL_FLAG_CONTIGUOUS)) - return -EINVAL; - This looks like a bug before, but with this patch, the check makes sense and is needed. ttm_bo_pin(>tbo); if (max_offset != 0) { @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, bo->placements[i].lpfn = lpfn;
Re: [PATCH] drm/amdgpu: Modify the contiguous flags behaviour
On 2024-04-14 10:57, Arunpravin Paneer Selvam wrote: Now we have two flags for contiguous VRAM buffer allocation. If the application request for AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, it would set the ttm place TTM_PL_FLAG_CONTIGUOUS flag in the buffer's placement function. This patch will change the default behaviour of the two flags. This change will simplify the KFD best effort contiguous VRAM allocation, because KFD doesn't need set new GEM_ flag. When we set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS - This means contiguous is not mandatory. AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS used in couple of places. For page table BO, it is fine as BO size is page size 4K. For 64KB reserved BOs and F/W size related BOs, do all allocation happen at driver initialization before the VRAM is fragmented? - we will try to allocate the contiguous buffer. Say if the allocation fails, we fallback to allocate the individual pages. When we setTTM_PL_FLAG_CONTIGUOUS - This means contiguous allocation is mandatory. - we are setting this in amdgpu_bo_pin_restricted() before bo validation and check this flag in the vram manager file. - if this is set, we should allocate the buffer pages contiguously. the allocation fails, we return -ENOSPC. 
Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 14 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 57 +++- 2 files changed, 49 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 8bc79924d171..41926d631563 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -153,8 +153,6 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain) else places[c].flags |= TTM_PL_FLAG_TOPDOWN; - if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; c++; } @@ -899,6 +897,8 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_operation_ctx ctx = { false, false }; + struct ttm_place *places = bo->placements; + u32 c = 0; int r, i; if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) @@ -921,16 +921,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, if (bo->tbo.pin_count) { uint32_t mem_type = bo->tbo.resource->mem_type; - uint32_t mem_flags = bo->tbo.resource->placement; if (!(domain & amdgpu_mem_type_to_domain(mem_type))) return -EINVAL; - if ((mem_type == TTM_PL_VRAM) && - (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) && - !(mem_flags & TTM_PL_FLAG_CONTIGUOUS)) - return -EINVAL; - This looks like a bug before, but with this patch, the check makes sense and is needed. ttm_bo_pin(>tbo); if (max_offset != 0) { @@ -968,6 +962,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, bo->placements[i].lpfn = lpfn; } + if (domain & AMDGPU_GEM_DOMAIN_VRAM && + !WARN_ON(places[c].mem_type != TTM_PL_VRAM)) + places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; + If BO pinned is not allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS, should pin and return scattered pages because the RDMA support scattered dmabuf. 
Christian also pointed this out. If (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && bo->placements[i].mem_type == TTM_PL_VRAM) o->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS; r = ttm_bo_validate(>tbo, >placement, ); if (unlikely(r)) { dev_err(adev->dev, "%p pin failed\n", bo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..ddbf302878f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -88,6 +88,30 @@ static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) return size; } +static inline unsigned long +amdgpu_vram_find_pages_per_block(struct ttm_buffer_object *tbo, + const struct ttm_place *place, + unsigned long bo_flags) +{ + unsigned long pages_per_block; + + if (bo_flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS || + place->flags & TTM_PL_FLAG_CONTIGUOUS) { + pages_per_block = ~0ul; + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pages_per_block = HPAGE_PMD_NR; +#else + /* default to 2MB */ + pages_per_block = 2UL << (20UL - PAGE_SHIFT); +#endif + pages_per_block = max_t(uint32_t, pages_per_block, + tbo->page_alignment); + } + + return pages_per_block; +} + /** * DOC: mem_info_vram_total * @@ -451,8 +475,10 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev
[PATCH 5/6] drm/amdgpu: Skip dma map resource for null RDMA device
To test RDMA using dummy driver on the system without NIC/RDMA device, the get dma pages pass in null device pointer, skip the dma map resource to avoid null device pointer access. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 33 +++- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 1d6e45e238e1..93fb63f4dae5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -674,12 +674,15 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, size_t size = cursor.size; dma_addr_t addr; - addr = dma_map_resource(dev, phys, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); - r = dma_mapping_error(dev, addr); - if (r) - goto error_unmap; - + if (dev) { + addr = dma_map_resource(dev, phys, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + r = dma_mapping_error(dev, addr); + if (r) + goto error_unmap; + } else { + addr = phys; + } sg_set_page(sg, NULL, size, 0); sg_dma_address(sg) = addr; sg_dma_len(sg) = size; @@ -693,10 +696,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, for_each_sgtable_sg((*sgt), sg, i) { if (!sg->length) continue; - - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); } sg_free_table(*sgt); @@ -721,10 +724,12 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev, struct scatterlist *sg; int i; - for_each_sgtable_sg(sgt, sg, i) - dma_unmap_resource(dev, sg->dma_address, - sg->length, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (dev) { + for_each_sgtable_sg(sgt, sg, i) + dma_unmap_resource(dev, sg->dma_address, + sg->length, dir, + DMA_ATTR_SKIP_CPU_SYNC); + } sg_free_table(sgt); kfree(sgt); } -- 2.43.2
[PATCH 2/6] drm/amdgpu: Evict BOs from same process for contiguous allocation
When TTM failed to alloc VRAM, TTM evict BOs from VRAM to system memory then retry the allocation, this currently skips the KFD BOs from the same process because KFD requires all BOs are resident for user queues. If TTM BO with TTM_PL_FLAG_CONTIGUOUS flag to alloc contiguous VRAM, allow TTM evict KFD BOs from the same process, this will evict the user queues first, and restore the queues later after contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index a5ceec7820cf..00b8603d73e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1383,7 +1383,8 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, */ dma_resv_for_each_fence(_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { - if (amdkfd_fence_check_mm(f, current->mm)) + if (amdkfd_fence_check_mm(f, current->mm) && + !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) return false; } -- 2.43.2
[PATCH 6/6] drm/amdkfd: Bump kfd version for contiguous VRAM allocation
Bump the kfd ioctl minor version to declare the contiguous VRAM allocation flag support. Signed-off-by: Philip Yang --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index c1394c162d4e..a0af2ef696ea 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -41,9 +41,10 @@ * - 1.13 - Add debugger API * - 1.14 - Update kfd_event_data * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl + * - 1.16 - Add contiguous VRAM allocation flag for RDMA */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 15 +#define KFD_IOCTL_MINOR_VERSION 16 struct kfd_ioctl_get_version_args { __u32 major_version;/* from KFD */ -- 2.43.2
[PATCH 4/6] drm/amdkfd: Increase KFD bo restore wait time
TTM allocate contiguous VRAM may take more than 1 second to evict BOs for larger size RDMA buffer. Because KFD restore bo worker reserves all KFD BOs, then TTM cannot hold the remaining KFD BOs lock to evict them, this may cause TTM to fail to alloc contiguous VRAM. Increase the KFD restore BO wait time to 2 seconds, long enough for RDMA pin BO to finish the contiguous VRAM allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a81ef232fdef..c205e2d3acf9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -698,7 +698,7 @@ struct qcm_process_device { /* KFD Memory Eviction */ /* Approx. wait time before attempting to restore evicted BOs */ -#define PROCESS_RESTORE_TIME_MS 100 +#define PROCESS_RESTORE_TIME_MS 2000 /* Approx. back off time if restore fails due to lack of memory */ #define PROCESS_BACK_OFF_TIME_MS 100 /* Approx. time before evicting the process again */ -- 2.43.2
[PATCH 3/6] drm/amdkfd: Evict BO itself for contiguous allocation
If the BO pages pinned for RDMA is not contiguous on VRAM, evict it to system memory first to free the VRAM space, then allocate contiguous VRAM and then move it from system memory back to VRAM. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 3523b91f8add..9506de1094ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,8 +1470,21 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; - if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT) + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT) { + /* +* If bo is not contiguous on VRAM, move to system memory first to ensure +* we can get contiguous VRAM space after evicting other BOs. +*/ + if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { + ret = amdgpu_amdkfd_bo_validate(bo, AMDGPU_GEM_DOMAIN_GTT, false); + if (unlikely(ret)) { + pr_debug("validate bo 0x%p to GTT failed %d\n", >tbo, ret); + return ret; + } + } + bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; + } ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) -- 2.43.2
[PATCH 1/6] drm/amdgpu: Support contiguous VRAM allocation
RDMA device with limited scatter-gather capability requires physical address contiguous VRAM buffer for RDMA peer direct access. Add a new KFD alloc memory flag and store as new GEM bo alloc flag. When pin this buffer object to export for RDMA peerdirect access, set AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag, and then vram_mgr will set TTM_PL_FLAG_CONTIGUOUS flag to ask VRAM buddy allocator to get contiguous VRAM. Remove the 2GB max memory block size limit for contiguous allocation. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 7 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 9 +++-- include/uapi/drm/amdgpu_drm.h| 5 + include/uapi/linux/kfd_ioctl.h | 1 + 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0ae9fd844623..3523b91f8add 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1470,6 +1470,9 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain) if (unlikely(ret)) return ret; + if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT) + bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; + ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0); if (ret) pr_err("Error in Pinning BO to domain: %d\n", domain); @@ -1712,6 +1715,10 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ? AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0; + + /* For contiguous VRAM allocation */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT) + alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT; } xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? 
0 : fpriv->xcp_id; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..1d6e45e238e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -516,8 +516,13 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - /* Limit maximum size to 2GiB due to SG table limitations */ - size = min(remaining_size, 2ULL << 30); + if (place->flags & TTM_PL_FLAG_CONTIGUOUS) + size = remaining_size; + else + /* Limit maximum size to 2GiB due to SG table limitations +* for no contiguous allocation. +*/ + size = min(remaining_size, 2ULL << 30); if ((size >= (u64)pages_per_block << PAGE_SHIFT) && !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index ad21c613fec8..13645abb8e46 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -171,6 +171,11 @@ extern "C" { * may override the MTYPE selected in AMDGPU_VA_OP_MAP. */ #define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) +/* Flag that allocating the BO with best effort for contiguous VRAM. + * If no contiguous VRAM, fallback to scattered allocation. + * Pin the BO for peerdirect RDMA trigger VRAM defragmentation. 
+ */ +#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS_BEST_EFFORT (1 << 16) struct drm_amdgpu_gem_create_in { /** the requested memory size */ diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2040a470ddb4..c1394c162d4e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -407,6 +407,7 @@ struct kfd_ioctl_acquire_vm_args { #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) #define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT (1 << 24) +#define KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS_BEST_EFFORT (1 << 23) /* Allocate memory for later SVM (shared virtual memory) mapping. * -- 2.43.2
[PATCH 0/6] Best effort contiguous VRAM allocation
This patch series implement new KFD memory alloc flag for best effort contiguous VRAM allocation, to support peer direct access RDMA device with limited scatter-gather dma capability. Philip Yang (6): drm/amdgpu: Support contiguous VRAM allocation drm/amdgpu: Evict BOs from same process for contiguous allocation drm/amdkfd: Evict BO itself for contiguous allocation drm/amdkfd: Increase KFD bo restore wait time drm/amdgpu: Skip dma map resource for null RDMA device drm/amdkfd: Bump kfd version for contiguous VRAM allocation .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 20 + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 42 --- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- include/uapi/drm/amdgpu_drm.h | 5 +++ include/uapi/linux/kfd_ioctl.h| 4 +- 6 files changed, 57 insertions(+), 19 deletions(-) -- 2.43.2
[PATCH] drm/amdgpu: Fix tlb_cb memory leaking
After updating GPU page table via CPU on large bar system, no fence callback, call amdgpu_vm_tlb_seq_cb directly after command committed to free tlb_cb. memory leaking backtrace from kmemleakd: unreferenced object 0xa036816b00c0 (size 32): backtrace: __kmem_cache_alloc_node+0x3fe/0x4d0 kmalloc_trace+0x2a/0xb0 amdgpu_vm_update_range+0x9b/0x8d0 [amdgpu] amdgpu_vm_clear_freed+0xc1/0x210 [amdgpu] unmap_bo_from_gpuvm.isra.36+0x37/0x50 [amdgpu] amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu+0x118/0x1b0 [amdgpu] kfd_process_device_free_bos+0x7c/0xe0 [amdgpu] kfd_process_wq_release+0x273/0x3c0 [amdgpu] process_scheduled_works+0x2a7/0x500 worker_thread+0x186/0x340 Fixes: 220ecde84bc8 ("drm/amdgpu: implement TLB flush fence") Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 8af3f0fd3073..d0ef727cd7e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -901,12 +901,9 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, { struct amdgpu_vm *vm = params->vm; - if (!fence || !*fence) - return; - tlb_cb->vm = vm; - if (!dma_fence_add_callback(*fence, _cb->cb, - amdgpu_vm_tlb_seq_cb)) { + if (fence && *fence && + !dma_fence_add_callback(*fence, _cb->cb, amdgpu_vm_tlb_seq_cb)) { dma_fence_put(vm->last_tlb_flush); vm->last_tlb_flush = dma_fence_get(*fence); } else { -- 2.43.2
Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device
On 2024-03-26 11:01, Felix Kuehling wrote: On 2024-03-26 10:53, Philip Yang wrote: On 2024-03-25 14:45, Felix Kuehling wrote: On 2024-03-22 15:57, Zhigang Luo wrote: it will cause page fault after device recovered if there is a process running. Signed-off-by: Zhigang Luo Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 70261eb9b0bb..2867e9186e44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); + amdgpu_amdkfd_wait_no_process_running(adev); + This waits for the processes to be terminated. What would cause the processes to be terminated? Why do the processes need to be terminated? Isn't it enough if the processes are removed from the runlist in pre-reset, so they can no longer execute on the GPU? mode 1 reset on SRIOV is much faster than BM, kgd2kfd_pre_reset sends GPU reset event to user space, don't remove queues from the runlist, after mode1 reset is done, there is queue still running and generate vm fault because the GPU page table is gone. I think seeing a page fault during the reset is not a problem. Seeing a page fault after the reset would be a bug. The process should not be on the runlist after the reset is done. Waiting for the process to terminate first looks like a workaround, when the real bug is maybe that we're not updating the process state correctly in pre-reset. All currently running processes should be put into evicted state, so they are not put back on the runlist after the reset. 
Forgot to mention it is F/W hang issue to trigger GPU reset, there is also error message when kgd2kfd_pre_reset -> kgd2kfd_suspend to evict queues from the runlist, yes, this seems W/A for the real issue related to mode1 reset. Regards, Philip Regards, Felix Regards, Philip Regards, Felix amdgpu_device_stop_pending_resets(adev); if (from_hypervisor)
Re: [PATCH 2/3] amd/amdgpu: wait no process running in kfd before resuming device
On 2024-03-25 14:45, Felix Kuehling wrote: On 2024-03-22 15:57, Zhigang Luo wrote: it will cause page fault after device recovered if there is a process running. Signed-off-by: Zhigang Luo Change-Id: Ib1eddb56b69ecd41fe703abd169944154f48b0cd --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 70261eb9b0bb..2867e9186e44 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4974,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); + amdgpu_amdkfd_wait_no_process_running(adev); + This waits for the processes to be terminated. What would cause the processes to be terminated? Why do the processes need to be terminated? Isn't it enough if the processes are removed from the runlist in pre-reset, so they can no longer execute on the GPU? mode 1 reset on SRIOV is much faster than BM, kgd2kfd_pre_reset sends GPU reset event to user space, don't remove queues from the runlist, after mode1 reset is done, there is queue still running and generate vm fault because the GPU page table is gone. Regards, Philip Regards, Felix amdgpu_device_stop_pending_resets(adev); if (from_hypervisor)
Re: [PATCH] drm/amdkfd: return negative error code in svm_ioctl()
On 2024-03-25 02:31, Su Hui wrote: svm_ioctl() should return negative error code in default case. Fixes: 42de677f7999 ("drm/amdkfd: register svm range") Signed-off-by: Su Hui Good catch, ioctl should return -errno. I will apply it to drm-next. Reviewed-by: Philip Yang --- Ps: When I try to compile this file, there is a error : drivers/gpu/drm/amd/amdkfd/kfd_migrate.c:28:10: fatal error: amdgpu_sync.h: No such file or directory. Maybe there are some steps I missed or this place need to be corrected? Don't know how you compile the driver, amdgpu_sync.h is located under amdgpu folder, amdkfd/Makefile is included from amdgpu/Makefile, which set ccflag-y -I correctly. Regards, Philip drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index f0f7f48af413..41c376f3fd27 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -4147,7 +4147,7 @@ svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, r = svm_range_get_attr(p, mm, start, size, nattrs, attrs); break; default: - r = EINVAL; + r = -EINVAL; break; }
[PATCH] drm/amdgpu: amdgpu_ttm_gart_bind set gtt bound flag
Otherwise amdgpu_ttm_backend_unbind will not clear the gart page table and leave valid mapping entry to the stale system page. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 75c9fd2c6c2a..b0ed10f4de60 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -869,6 +869,7 @@ static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev, amdgpu_gart_bind(adev, gtt->offset, ttm->num_pages, gtt->ttm.dma_address, flags); } + gtt->bound = true; } /* -- 2.35.1
Re: [PATCH v5 1/2] drm/amdgpu: implement TLB flush fence
On 2024-03-06 09:41, Shashank Sharma wrote: From: Christian König The problem is that when (for example) 4k pages are replaced with a single 2M page we need to wait for change to be flushed out by invalidating the TLB before the PT can be freed. Solve this by moving the TLB flush into a DMA-fence object which can be used to delay the freeing of the PT BOs until it is signaled. V2: (Shashank) - rebase - set dma_fence_error only in case of error - add tlb_flush fence only when PT/PD BO is locked (Felix) - use vm->pasid when f is NULL (Mukul) V4: - add a wait for (f->dependency) in tlb_fence_work (Christian) - move the misplaced fence_create call to the end (Philip) V5: - free the f->dependency properly (Christian) Cc: Christian Koenig Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Cc: Alex Deucher Reviewed-by: Shashank Sharma Signed-off-by: Christian König Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 + .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c | 112 ++ 4 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index fa26a4e3a99d..91ab4cf29b5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \ atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \ atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ - amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ + amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ + amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \ amdgpu_atomfirmware.o amdgpu_vf_error.o 
amdgpu_sched.o \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0960e0a665d3..310aae6fb49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, r = vm->update_funcs->commit(, fence); + /* Prepare a TLB flush fence to be attached to PTs */ + if (!unlocked && params.needs_flush && vm->is_compute_context) { + amdgpu_vm_tlb_fence_create(adev, vm, fence); + + /* Makes sure no PD/PT is freed before the flush */ + dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, + DMA_RESV_USAGE_BOOKKEEP); + } + error_unlock: amdgpu_vm_eviction_unlock(vm); drm_dev_exit(idx); @@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, mutex_init(>eviction_lock); vm->evicting = false; + vm->tlb_fence_context = dma_fence_context_alloc(1); r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, false, , xcp_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 64b3f69efa57..298f604b8e5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -341,6 +341,7 @@ struct amdgpu_vm { atomic64_t tlb_seq; uint64_t tlb_seq_va; uint64_t *tlb_seq_cpu_addr; + uint64_t tlb_fence_context; atomic64_t kfd_last_flushed_seq; @@ -594,5 +595,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, uint64_t addr, uint32_t status, unsigned int vmhub); +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct dma_fence **fence); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c new file mode 100644 index ..51cddfa3f1e8 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
Re: [PATCH v4 2/2] drm/amdgpu: sync page table freeing with tlb flush
On 2024-03-01 06:07, Shashank Sharma wrote: The idea behind this patch is to delay the freeing of PT entry objects until the TLB flush is done. This patch: - Adds a tlb_flush_waitlist which will keep the objects that need to be freed after tlb_flush - Adds PT entries in this list in amdgpu_vm_pt_free_dfs, instead of freeing them immediately. - Exports function amdgpu_vm_pt_free to be called dircetly. - Adds a 'force' input bool to amdgpu_vm_pt_free_dfs to differentiate between immediate freeing of the BOs (like from amdgpu_vm_pt_free_root) vs delayed freeing. V2: rebase V4: (Christian) - add only locked PTEs entries in TLB flush waitlist. - do not create a separate function for list flush. - do not create a new lock for TLB flush. - there is no need to wait on tlb_flush_fence exclusively. Cc: Christian König Cc: Alex Deucher Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 21 ++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 310aae6fb49b..94581a1fe34f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -990,11 +990,20 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, /* Prepare a TLB flush fence to be attached to PTs */ if (!unlocked && params.needs_flush && vm->is_compute_context) { + struct amdgpu_vm_bo_base *entry, *next; + amdgpu_vm_tlb_fence_create(adev, vm, fence); /* Makes sure no PD/PT is freed before the flush */ dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, DMA_RESV_USAGE_BOOKKEEP); + + if (list_empty(>tlb_flush_waitlist)) + goto error_unlock; + + /* Now actually free the waitlist */ + list_for_each_entry_safe(entry, next, >tlb_flush_waitlist, vm_status) + amdgpu_vm_pt_free(entry); } error_unlock: @@ -2214,6 
+2223,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, INIT_LIST_HEAD(>pt_freed); INIT_WORK(>pt_free_work, amdgpu_vm_pt_free_work); INIT_KFIFO(vm->faults); + INIT_LIST_HEAD(>tlb_flush_waitlist); r = amdgpu_seq64_alloc(adev, >tlb_seq_va, >tlb_seq_cpu_addr); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 298f604b8e5f..ba374c2c61bd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -343,6 +343,9 @@ struct amdgpu_vm { uint64_t *tlb_seq_cpu_addr; uint64_t tlb_fence_context; + /* temporary storage of PT BOs until the TLB flush */ + struct list_head tlb_flush_waitlist; + atomic64_t kfd_last_flushed_seq; /* How many times we had to re-generate the page tables */ @@ -545,6 +548,7 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, uint64_t start, uint64_t end, uint64_t dst, uint64_t flags); void amdgpu_vm_pt_free_work(struct work_struct *work); +void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry); #if defined(CONFIG_DEBUG_FS) void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 95dc0afdaffb..cb14e5686c0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -636,7 +636,7 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev, * * @entry: PDE to free */ -static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) +void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) { struct amdgpu_bo *shadow; @@ -685,13 +685,15 @@ void amdgpu_vm_pt_free_work(struct work_struct *work) * @vm: amdgpu vm structure * @start: optional cursor where to start freeing PDs/PTs * @unlocked: vm resv unlock status + * @force: force free all PDs/PTs without waiting for TLB flush * * Free the page directory or page table level and all sub levels. 
*/ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct amdgpu_vm_pt_cursor *start, - bool unlocked) + bool unlocked, + bool force) { struct amdgpu_vm_pt_cursor cursor; struct amdgpu_vm_bo_base *entry; @@ -708,11 +710,15 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, return; } - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) - amdgpu_vm_pt_free(entry); I feel like if we attach tlb flush fence before free pt bo, then don't need tlb_flush_waitlist. Regards, Philip + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) { + if (!force) + list_move(>vm_status, >tlb_flush_waitlist); + else + amdgpu_vm_pt_free(entry); + } if
Re: [PATCH v4 1/2] drm/amdgpu: implement TLB flush fence
On 2024-03-01 06:07, Shashank Sharma wrote: From: Christian König The problem is that when (for example) 4k pages are replaced with a single 2M page we need to wait for change to be flushed out by invalidating the TLB before the PT can be freed. Solve this by moving the TLB flush into a DMA-fence object which can be used to delay the freeing of the PT BOs until it is signaled. V2: (Shashank) - rebase - set dma_fence_error only in case of error - add tlb_flush fence only when PT/PD BO is locked (Felix) - use vm->pasid when f is NULL (Mukul) V4: - add a wait for (f->dependency) in tlb_fence_work (Christian) - move the misplaced fence_create call to the end (Philip) Cc: Christian Koenig Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Cc: Alex Deucher Signed-off-by: Christian König Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 + .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c | 111 ++ 4 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index fa26a4e3a99d..91ab4cf29b5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \ atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \ atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ - amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ + amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ + amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \ amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0960e0a665d3..310aae6fb49b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -988,6 +988,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, r = vm->update_funcs->commit(, fence); + /* Prepare a TLB flush fence to be attached to PTs */ + if (!unlocked && params.needs_flush && vm->is_compute_context) { + amdgpu_vm_tlb_fence_create(adev, vm, fence); + + /* Makes sure no PD/PT is freed before the flush */ + dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, + DMA_RESV_USAGE_BOOKKEEP); + } + Adding fence here seems too late, the fence has to add before calling amdgpu_vm_pt_free_dfs inside amdgpu_vm_ptes_update. error_unlock: amdgpu_vm_eviction_unlock(vm); drm_dev_exit(idx); @@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, mutex_init(>eviction_lock); vm->evicting = false; + vm->tlb_fence_context = dma_fence_context_alloc(1); r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, false, , xcp_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 64b3f69efa57..298f604b8e5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -341,6 +341,7 @@ struct amdgpu_vm { atomic64_t tlb_seq; uint64_t tlb_seq_va; uint64_t *tlb_seq_cpu_addr; + uint64_t tlb_fence_context; atomic64_t kfd_last_flushed_seq; @@ -594,5 +595,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, uint64_t addr, uint32_t status, unsigned int vmhub); +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct dma_fence **fence); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c new file mode 100644 index ..54c33c24fa46 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c @@ -0,0 +1,111 @@ +// 
SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +
Re: [PATCH v3 3/3] drm/amdgpu: sync page table freeing with tlb flush
On 2024-02-23 08:42, Shashank Sharma wrote: This patch: - adds a new list in amdgpu_vm to hold the VM PT entries being freed - waits for the TLB flush using the vm->tlb_flush_fence - actually frees the PT BOs V2: rebase V3: Do not attach the tlb_fence to the entries, rather add the entries to a list and delay their freeing (Christian) Cc: Christian König Cc: Alex Deucher Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 6 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 6 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 51 --- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 67c690044b97..eebb73f2c2ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -939,6 +939,10 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, /* Makes sure no PD/PT is freed before the flush */ dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, DMA_RESV_USAGE_BOOKKEEP); + + mutex_lock(&vm->tlb_fence_lock); + vm->tlb_fence_last = *fence; + mutex_unlock(&vm->tlb_fence_lock); } amdgpu_res_first(pages_addr ? 
NULL : res, offset, @@ -2212,6 +2216,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, INIT_LIST_HEAD(>freed); INIT_LIST_HEAD(>done); INIT_LIST_HEAD(>pt_freed); + INIT_LIST_HEAD(>tlb_flush_waitlist); INIT_WORK(>pt_free_work, amdgpu_vm_pt_free_work); INIT_KFIFO(vm->faults); @@ -2244,6 +2249,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->last_unlocked = dma_fence_get_stub(); vm->generation = 0; + mutex_init(>tlb_fence_lock); mutex_init(>eviction_lock); vm->evicting = false; vm->tlb_fence_context = dma_fence_context_alloc(1); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 8e6fd25d07b7..77f10ed80973 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -334,6 +334,10 @@ struct amdgpu_vm { uint64_t *tlb_seq_cpu_addr; uint64_t tlb_fence_context; + struct mutex tlb_fence_lock; + struct dma_fence *tlb_fence_last; + struct list_head tlb_flush_waitlist; + atomic64_t kfd_last_flushed_seq; /* How many times we had to re-generate the page tables */ @@ -379,6 +383,8 @@ struct amdgpu_vm { /* cached fault info */ struct amdgpu_vm_fault_info fault_info; + + int count_bos; }; struct amdgpu_vm_manager { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 95dc0afdaffb..57ea95c5c085 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -643,13 +643,13 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) if (!entry->bo) return; - entry->bo->vm_bo = NULL; shadow = amdgpu_bo_shadowed(entry->bo); if (shadow) { ttm_bo_set_bulk_move(>tbo, NULL); amdgpu_bo_unref(); } ttm_bo_set_bulk_move(>bo->tbo, NULL); + entry->bo->vm_bo = NULL; spin_lock(>vm->status_lock); list_del(>vm_status); @@ -657,6 +657,38 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) amdgpu_bo_unref(>bo); } +static void 
amdgpu_vm_pt_flush_waitlist(struct amdgpu_vm *vm) +{ + struct amdgpu_vm_bo_base *entry, *next; + LIST_HEAD(tlb_flush_waitlist); + + if (!vm || list_empty(>tlb_flush_waitlist)) + return; + + /* Wait for pending TLB flush before freeing PT BOs */ + mutex_lock(>tlb_fence_lock); + if (vm->tlb_fence_last && !dma_fence_is_signaled(vm->tlb_fence_last)) { + if (dma_fence_wait_timeout(vm->tlb_fence_last, false, + MAX_SCHEDULE_TIMEOUT) <= 0) { + DRM_ERROR("Timedout waiting for TLB flush, not freeing PT BOs\n"); + mutex_unlock(>tlb_fence_lock); + return; + } + + vm->tlb_fence_last = NULL; + } + + /* Save the waitlist locally and reset the flushlist */ + list_splice_init(>tlb_flush_waitlist, _flush_waitlist); + mutex_unlock(>tlb_fence_lock); + + /* Now free the entries */ + list_for_each_entry_safe(entry, next, _flush_waitlist, vm_status) { + if (entry) + amdgpu_vm_pt_free(entry); + } +} + void amdgpu_vm_pt_free_work(struct work_struct *work) { struct amdgpu_vm_bo_base *entry, *next; @@ -673,7 +705,7 @@ void amdgpu_vm_pt_free_work(struct work_struct *work) amdgpu_bo_reserve(vm->root.bo, true); list_for_each_entry_safe(entry, next, _freed, vm_status) - amdgpu_vm_pt_free(entry); + list_move(>vm_status, >tlb_flush_waitlist); amdgpu_bo_unreserve(vm->root.bo); } @@ -708,11 +740,17 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, return; } - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) - amdgpu_vm_pt_free(entry); + mutex_lock(>tlb_fence_lock); + + for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) { + if
Re: [PATCH v3 2/3] drm/amdgpu: implement TLB flush fence
On 2024-02-23 11:58, Philip Yang wrote: On 2024-02-23 08:42, Shashank Sharma wrote: From: Christian König The problem is that when (for example) 4k pages are replaced with a single 2M page we need to wait for change to be flushed out by invalidating the TLB before the PT can be freed. Solve this by moving the TLB flush into a DMA-fence object which can be used to delay the freeing of the PT BOs until it is signaled. V2: (Shashank) - rebase - set dma_fence_error only in case of error - add tlb_flush fence only when PT/PD BO is locked (Felix) - use vm->pasid when f is NULL (Mukul) Cc: Christian Koenig Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Cc: Alex Deucher Signed-off-by: Christian König Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 + .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c | 106 ++ 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 4c989da4d2f3..fdbb3d770c7b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \ atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \ atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ - amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ + amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ + amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \ amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0960e0a665d3..67c690044b97 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -932,6 +932,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) goto error_unlock; + /* Prepare a TLB flush fence to be attached to PTs */ + if (!unlocked && params.needs_flush && vm->is_compute_context) { + amdgpu_vm_tlb_fence_create(adev, vm, fence); + + /* Makes sure no PD/PT is freed before the flush */ + dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, + DMA_RESV_USAGE_BOOKKEEP); + } + amdgpu_res_first(pages_addr ? NULL : res, offset, (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, ); while (cursor.remaining) { @@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, mutex_init(>eviction_lock); vm->evicting = false; + vm->tlb_fence_context = dma_fence_context_alloc(1); r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, false, , xcp_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index ac9380afcb69..8e6fd25d07b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -332,6 +332,7 @@ struct amdgpu_vm { atomic64_t tlb_seq; uint64_t tlb_seq_va; uint64_t *tlb_seq_cpu_addr; + uint64_t tlb_fence_context; atomic64_t kfd_last_flushed_seq; @@ -585,5 +586,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, uint64_t addr, uint32_t status, unsigned int vmhub); +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct dma_fence **fence); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c new file mode 100644 index ..569681badd7c --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILI
Re: [PATCH v3 2/3] drm/amdgpu: implement TLB flush fence
On 2024-02-23 08:42, Shashank Sharma wrote: From: Christian König The problem is that when (for example) 4k pages are replaced with a single 2M page we need to wait for change to be flushed out by invalidating the TLB before the PT can be freed. Solve this by moving the TLB flush into a DMA-fence object which can be used to delay the freeing of the PT BOs until it is signaled. V2: (Shashank) - rebase - set dma_fence_error only in case of error - add tlb_flush fence only when PT/PD BO is locked (Felix) - use vm->pasid when f is NULL (Mukul) Cc: Christian Koenig Cc: Felix Kuehling Cc: Rajneesh Bhardwaj Cc: Alex Deucher Signed-off-by: Christian König Signed-off-by: Shashank Sharma --- drivers/gpu/drm/amd/amdgpu/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 10 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 + .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c | 106 ++ 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 4c989da4d2f3..fdbb3d770c7b 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \ amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \ atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \ atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ - amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ + amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ + amdgpu_ib.o amdgpu_pll.o \ amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \ amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \ amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0960e0a665d3..67c690044b97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -932,6 +932,15 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) goto error_unlock; + /* Prepare a TLB flush fence to be attached to PTs */ + if (!unlocked && params.needs_flush && vm->is_compute_context) { + amdgpu_vm_tlb_fence_create(adev, vm, fence); + + /* Makes sure no PD/PT is freed before the flush */ + dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, + DMA_RESV_USAGE_BOOKKEEP); + } + amdgpu_res_first(pages_addr ? NULL : res, offset, (last - start + 1) * AMDGPU_GPU_PAGE_SIZE, ); while (cursor.remaining) { @@ -2237,6 +2246,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, mutex_init(>eviction_lock); vm->evicting = false; + vm->tlb_fence_context = dma_fence_context_alloc(1); r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level, false, , xcp_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index ac9380afcb69..8e6fd25d07b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -332,6 +332,7 @@ struct amdgpu_vm { atomic64_t tlb_seq; uint64_t tlb_seq_va; uint64_t *tlb_seq_cpu_addr; + uint64_t tlb_fence_context; atomic64_t kfd_last_flushed_seq; @@ -585,5 +586,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev, uint64_t addr, uint32_t status, unsigned int vmhub); +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, + struct amdgpu_vm *vm, + struct dma_fence **fence); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c new file mode 100644 index ..569681badd7c --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES
Re: [PATCH] drm/amdgpu: break COW for user ptr during fork()
On 2024-02-21 21:01, Lang Yu wrote: This is useful to prevent copy-on-write semantics from changing the physical location of a page if the parent writes to it after a fork(). Signed-off-by: Lang Yu --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 75c9fd2c6c2a..2ee0af3c41b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -693,6 +693,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages, } readonly = amdgpu_ttm_tt_is_readonly(ttm); + vm_flags_set(vma, VM_DONTCOPY); This will break user mode because the forked child process cannot access this vma/userptr. This can be set by application if needed, using madvise(...MADV_DONTFORK) to avoid COW after fork. Regards, Philip r = amdgpu_hmm_range_get_pages(&bo->notifier, start, ttm->num_pages, readonly, NULL, pages, range); out_unlock: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 6aa032731ddc..607a8f68f26f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1674,6 +1674,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, next = min(vma->vm_end, end); npages = (next - addr) >> PAGE_SHIFT; WRITE_ONCE(p->svms.faulting_task, current); + vm_flags_set(vma, VM_DONTCOPY); r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages, readonly, owner, NULL, &hmm_range);
Re: [PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro
On 2024-02-16 15:16, Felix Kuehling wrote: On 2024-02-15 10:18, Philip Yang wrote: Document how to use SMI system management interface to receive SVM events. Define SVM events message string format macro that could use by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++--- include/uapi/linux/kfd_ioctl.h | 77 - 2 files changed, 102 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index d9953c2b2661..85465eb303a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); + kfd_smi_event_add(0, dev, event, + KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", - throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, + KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask, + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (!task_info.pid) return; - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", - task_info.pid, task_info.task_name); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, + KFD_EVENT_FMT_VMFAULT(task_info.pid, task_info.task_name)); } void kfd_smi_event_page_fault_start(struct kfd_node *node, 
pid_t pid, @@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE
Re: [PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro
On 2024-02-15 12:54, Chen, Xiaogang wrote: On 2/15/2024 9:18 AM, Philip Yang wrote: Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. Document how to use SMI system management interface to receive SVM events. Define SVM events message string format macro that could use by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++--- include/uapi/linux/kfd_ioctl.h | 77 - 2 files changed, 102 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index d9953c2b2661..85465eb303a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); + kfd_smi_event_add(0, dev, event, + KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", - throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, + KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask, + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (!task_info.pid) return; - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", - task_info.pid, task_info.task_name); + kfd_smi_event_add(0, dev, 
KFD_SMI_EVENT_VMFAULT, + KFD_EVENT_FMT_VMFAULT(task_info.pid, task_info.task_name)); } void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, @@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_bootti
[PATCH 2/2] drm/amdkfd: Output migrate end event if migration failed
To track the migrate end-event in case of a migration failure, always output migrate end event, with the failure result added to the existing migrate end event string. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c| 16 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 2 +- include/uapi/linux/kfd_ioctl.h | 7 --- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 480e222364d5..23cf9484331e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -445,15 +445,15 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", mpages, cpages, migrate.npages); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - 0, node->id, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); out: + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + 0, node->id, trigger, r); + if (!r && mpages) { pdd = svm_range_get_pdd_by_node(prange, node); if (pdd) @@ -737,15 +737,15 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, svm_migrate_copy_done(adev, mfence); migrate_vma_finalize(); - kfd_smi_event_migration_end(node, p->lead_thread->pid, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - node->id, 0, trigger); - svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages); out_free: kvfree(buf); out: + kfd_smi_event_migration_end(node, p->lead_thread->pid, + start >> PAGE_SHIFT, end >> PAGE_SHIFT, + node->id, 0, trigger, r); + if (!r && cpages) { mpages = cpages - upages; pdd = svm_range_get_pdd_by_node(prange, node); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 85465eb303a9..d1a567f8a8d9 100644 
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -282,11 +282,12 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, -uint32_t from, uint32_t to, uint32_t trigger) +uint32_t from, uint32_t to, uint32_t trigger, +int result) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), pid, - start, end - start, from, to, trigger)); + start, end - start, from, to, trigger, result)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index fa95c2dfd587..6c99eaa39f09 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -41,7 +41,7 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, unsigned long start, unsigned long end, -uint32_t from, uint32_t to, uint32_t trigger); +uint32_t from, uint32_t to, uint32_t trigger, int r); void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger); void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 430c01f4148b..5220670a434d 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -601,6 +601,7 @@ struct kfd_ioctl_smi_events_args { *migrate_update: the GPU page is recovered by 'M' for migrate, 'U' for update *rescheduled: 'R' if the queue restore failed and rescheduled to try again *rw: 'W' for write page fault, 'R' for read page fault + *result: page migrate result, 0 for success, otherwise error code */ #define KFD_EVENT_FMT_UPDATE_GP
[PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro
Document how to use SMI system management interface to receive SVM events. Define SVM events message string format macro that could use by user mode for sscanf to parse the event. Add it to uAPI header file to make it obvious that is changing uAPI in future. No functional changes. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51 +++--- include/uapi/linux/kfd_ioctl.h | 77 - 2 files changed, 102 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index d9953c2b2661..85465eb303a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset) event = KFD_SMI_EVENT_GPU_PRE_RESET; ++(dev->reset_seq_num); } - kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num); + kfd_smi_event_add(0, dev, event, + KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, "%llx:%llx\n", - throttle_bitmask, - amdgpu_dpm_get_thermal_throttling_counter(dev->adev)); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, + KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask, + amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) @@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (!task_info.pid) return; - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n", - task_info.pid, task_info.task_name); + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, + KFD_EVENT_FMT_VMFAULT(task_info.pid, task_info.task_name)); } void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, @@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct kfd_node 
*node, pid_t pid, ktime_t ts) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R'); + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, + address, node->id, write_fault ? 'W' : 'R')); } void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, unsigned long address, bool migration) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, - "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U'); + KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), + pid, address, node->id, migration ? 'M' : 'U')); } void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, @@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", - ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger); + KFD_EVENT_FMT_MIGRATE_START(ktime_get_boottime_ns(), + pid, start, end - start, from, to, prefetch_loc, + preferred_loc, trigger)); } void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, @@ -284,24 +285,23 @@ void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, uint32_t from, uint32_t to, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - "%lld -%d @%lx(%lx) %x->%x %d\n", - ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger); + KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), pid, + start, end - start, from, to, trigger)); } void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, uint32_t trigger) { kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION, -
[PATCH 2/2] drm/amdgpu: Improve huge page mapping update
Update huge page mapping, ex 2MB address and size aligned, we alloc PTB bo, and then free the PTB bo after updating PDE0 as PTE. If fragment size >= parent_shift, don't alloc PT bo, because we will update PDE entry, this will improve the huge page mapping update by removing the extra PTB bo alloc and free. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index a3d609655ce3..ef3ef03e50ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -916,7 +916,11 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, uint64_t incr, entry_end, pe_start; struct amdgpu_bo *pt; - if (!params->unlocked) { + shift = amdgpu_vm_pt_level_shift(adev, cursor.level); + parent_shift = amdgpu_vm_pt_level_shift(adev, cursor.level - 1); + + if (!params->unlocked && + (adev->asic_type < CHIP_VEGA10 || frag < parent_shift)) { /* make sure that the page tables covering the * address range are actually allocated */ @@ -926,8 +930,6 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, return r; } - shift = amdgpu_vm_pt_level_shift(adev, cursor.level); - parent_shift = amdgpu_vm_pt_level_shift(adev, cursor.level - 1); if (params->unlocked) { /* Unlocked updates are only allowed on the leaves */ if (amdgpu_vm_pt_descendant(adev, )) -- 2.35.1
[PATCH 1/2] drm/amdgpu: Unmap only clear the page table leaves
SVM migration unmap pages from GPU and then update mapping to GPU to recover page fault. Currently unmap clears the PDE entry for range length >= huge page and free PTB bo, update mapping to alloc new PT bo. There is race bug that the freed entry bo maybe still on the pt_free list, reused when updating mapping and then freed, leave invalid PDE entry and cause GPU page fault. By setting the update to clear only one PDE entry or clear PTB, to avoid unmap to free PTE bo. This fixes the race bug and improve the unmap and map to GPU performance. Update mapping to huge page will still free the PTB bo. With this change, the vm->pt_freed list and work is not needed. Add WARN_ON(unlocked) in amdgpu_vm_pt_free_dfs to catch if unmap to free the PTB. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 4 --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 43 ++- 3 files changed, 10 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 82e5fd66a10d..3bde77dfc63f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2256,8 +2256,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, spin_lock_init(>status_lock); INIT_LIST_HEAD(>freed); INIT_LIST_HEAD(>done); - INIT_LIST_HEAD(>pt_freed); - INIT_WORK(>pt_free_work, amdgpu_vm_pt_free_work); INIT_KFIFO(vm->faults); r = amdgpu_vm_init_entities(adev, vm); @@ -2446,8 +2444,6 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm); - flush_work(>pt_free_work); - root = amdgpu_bo_ref(vm->root.bo); amdgpu_bo_reserve(root, true); amdgpu_vm_set_pasid(adev, vm, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index cdb61f1e7c35..74fe211b9ecd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ 
-316,10 +316,6 @@ struct amdgpu_vm { /* BOs which are invalidated, has been updated in the PTs */ struct list_headdone; - /* PT BOs scheduled to free and fill with zero if vm_resv is not hold */ - struct list_headpt_freed; - struct work_struct pt_free_work; - /* contains the page directory */ struct amdgpu_vm_bo_base root; struct dma_fence*last_update; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index a160265ddc07..a3d609655ce3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -657,27 +657,6 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) amdgpu_bo_unref(>bo); } -void amdgpu_vm_pt_free_work(struct work_struct *work) -{ - struct amdgpu_vm_bo_base *entry, *next; - struct amdgpu_vm *vm; - LIST_HEAD(pt_freed); - - vm = container_of(work, struct amdgpu_vm, pt_free_work); - - spin_lock(>status_lock); - list_splice_init(>pt_freed, _freed); - spin_unlock(>status_lock); - - /* flush_work in amdgpu_vm_fini ensure vm->root.bo is valid. 
*/ - amdgpu_bo_reserve(vm->root.bo, true); - - list_for_each_entry_safe(entry, next, _freed, vm_status) - amdgpu_vm_pt_free(entry); - - amdgpu_bo_unreserve(vm->root.bo); -} - /** * amdgpu_vm_pt_free_dfs - free PD/PT levels * @@ -696,17 +675,7 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, struct amdgpu_vm_pt_cursor cursor; struct amdgpu_vm_bo_base *entry; - if (unlocked) { - spin_lock(>status_lock); - for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) - list_move(>vm_status, >pt_freed); - - if (start) - list_move(>entry->vm_status, >pt_freed); - spin_unlock(>status_lock); - schedule_work(>pt_free_work); - return; - } + WARN_ON(unlocked); for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) amdgpu_vm_pt_free(entry); @@ -1009,7 +978,15 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, incr = (uint64_t)AMDGPU_GPU_PAGE_SIZE << shift; mask = amdgpu_vm_pt_entries_mask(adev, cursor.level); pe_start = ((cursor.pfn >> shift) & mask) * 8; - entry_end = ((uint64_t)mask + 1) << shift; + + if (cursor.level < AMDGPU_VM_PTB && params->unlocked) + /* +* Unmap to clear o
Re: [PATCH] drm/amdgpu: Support >=4GB GTT memory mapping
On 2024-01-29 11:30, Christian König wrote: Am 29.01.24 um 17:25 schrieb Philip Yang: On 2024-01-29 05:06, Christian König wrote: Am 26.01.24 um 20:47 schrieb Philip Yang: This is to work around a bug in function drm_prime_pages_to_sg if length of nr_pages >= 4GB, by doing the same check for max_segment and then calling sg_alloc_table_from_pages_segment directly instead. This issue shows up on APU because VRAM is allocated as GTT memory. It also fixes >=4GB GTT memory mapping for mGPUs with IOMMU isolation mode. Well that was talked about before and rejected. If we really want more than 4GiB in DMA-bufs we need to fix drm_prime_pages_to_sg() instead. I sent a patch to fix drm_prime_pages_to_sg but the patch was rejected. Why was that rejected? If this isn't something we want for DRM we probably don't want it for AMDGPU either. The reason is same as your concern, to check if we want more than 4GB dmabuf support and may need fix for other drm functions. I am not familiar with drm layer, amdgpu need more than 4GB dmabuf on mGPUs APU. Do you want me to resend that drm patch to fix only drm_prime_pages_to_sg function? anything like this will cause size becomes 0 if nr_pages size is more than 4GB: unsigned int nr_pages; unsigned long size = nr_pages << PAGE_SHIFT; Regards, Philip This issue happens on APU, as VRAM is allocated as GTT memory, get to this patch only if IOMMU is isolation mode, with IOMMU off or pt mode, multiple GPUs share the same dma mapping. Even with the fix patch accepted by drm, we still need this patch to workaround the issue on old kernel version. Yeah, but that's then just a functions fixup for our backporting team and shouldn't be worked around like this. Regards, Christian. Regards, Philip Regards, Christian. 
Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 50 ++--- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 055ba2ea4c12..a203633fd629 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -171,18 +171,41 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, } switch (bo->tbo.resource->mem_type) { - case TTM_PL_TT: - sgt = drm_prime_pages_to_sg(obj->dev, - bo->tbo.ttm->pages, - bo->tbo.ttm->num_pages); - if (IS_ERR(sgt)) - return sgt; - - if (dma_map_sgtable(attach->dev, sgt, dir, - DMA_ATTR_SKIP_CPU_SYNC)) - goto error_free; - break; + case TTM_PL_TT: { + size_t max_segment = 0; + u64 num_pages; + int err; + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + if (obj->dev) + max_segment = dma_max_mapping_size(obj->dev->dev); + if (max_segment == 0) + max_segment = UINT_MAX; + + /* + * Use u64, otherwise if length of num_pages >= 4GB then size
Re: [PATCH] drm/amdgpu: Support >=4GB GTT memory mapping
On 2024-01-29 05:06, Christian König wrote: Am 26.01.24 um 20:47 schrieb Philip Yang: This is to work around a bug in function drm_prime_pages_to_sg if length of nr_pages >= 4GB, by doing the same check for max_segment and then calling sg_alloc_table_from_pages_segment directly instead. This issue shows up on APU because VRAM is allocated as GTT memory. It also fixes >=4GB GTT memory mapping for mGPUs with IOMMU isolation mode. Well that was talked about before and rejected. If we really want more than 4GiB in DMA-bufs we need to fix drm_prime_pages_to_sg() instead. I sent a patch to fix drm_prime_pages_to_sg but the patch was rejected. This issue happens on APU, as VRAM is allocated as GTT memory, get to this patch only if IOMMU is isolation mode, with IOMMU off or pt mode, multiple GPUs share the same dma mapping. Even with the fix patch accepted by drm, we still need this patch to workaround the issue on old kernel version. Regards, Philip Regards, Christian. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 50 ++--- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 055ba2ea4c12..a203633fd629 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -171,18 +171,41 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, } switch (bo->tbo.resource->mem_type) { - case TTM_PL_TT: - sgt = drm_prime_pages_to_sg(obj->dev, - bo->tbo.ttm->pages, - bo->tbo.ttm->num_pages); - if (IS_ERR(sgt)) - return sgt; - - if (dma_map_sgtable(attach->dev, sgt, dir, - DMA_ATTR_SKIP_CPU_SYNC)) - goto error_free; - break; + case TTM_PL_TT: { + size_t max_segment = 0; + u64 num_pages; + int err; + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + if (obj->dev) + max_segment = dma_max_mapping_size(obj->dev->dev); + if (max_segment == 0) + 
max_segment = UINT_MAX; + + /* + * Use u64, otherwise if length of num_pages >= 4GB then size + * (num_pages << PAGE_SHIFT) becomes 0 + */ + num_pages = bo->tbo.ttm->num_pages; + err = sg_alloc_table_from_pages_segment(sgt, bo->tbo.ttm->pages, + num_pages, 0, + num_pages << PAGE_SHIFT, + max_segment, GFP_KERNEL); + if (err) { + kfree(sgt); + return ERR_PTR(err); + } + if (dma_map_sgtable(attach->dev, sgt, dir, DMA_ATTR_SKIP_CPU_SYNC)) { + sg_free_table(sgt); + kfree(sgt); + return ERR_PTR(-EBUSY); + } + break; + } case TTM_PL_VRAM: r = amdgpu_vram_mgr_alloc_sgt(adev, bo->tbo.resource, 0, bo->tbo.base.size, attach->dev, @@ -195,11 +218,6 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, } return sgt; - -error_free: - sg_free_table(sgt); - kfree(sgt); - return ERR_PTR(-EBUSY); } /**
[PATCH] drm/amdgpu: Support >=4GB GTT memory mapping
This is to work around a bug in function drm_prime_pages_to_sg if length of nr_pages >= 4GB, by doing the same check for max_segment and then calling sg_alloc_table_from_pages_segment directly instead. This issue shows up on APU because VRAM is allocated as GTT memory. It also fixes >=4GB GTT memory mapping for mGPUs with IOMMU isolation mode. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 50 ++--- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 055ba2ea4c12..a203633fd629 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -171,18 +171,41 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, } switch (bo->tbo.resource->mem_type) { - case TTM_PL_TT: - sgt = drm_prime_pages_to_sg(obj->dev, - bo->tbo.ttm->pages, - bo->tbo.ttm->num_pages); - if (IS_ERR(sgt)) - return sgt; - - if (dma_map_sgtable(attach->dev, sgt, dir, - DMA_ATTR_SKIP_CPU_SYNC)) - goto error_free; - break; + case TTM_PL_TT: { + size_t max_segment = 0; + u64 num_pages; + int err; + + sgt = kmalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM); + + if (obj->dev) + max_segment = dma_max_mapping_size(obj->dev->dev); + if (max_segment == 0) + max_segment = UINT_MAX; + + /* +* Use u64, otherwise if length of num_pages >= 4GB then size +* (num_pages << PAGE_SHIFT) becomes 0 +*/ + num_pages = bo->tbo.ttm->num_pages; + err = sg_alloc_table_from_pages_segment(sgt, bo->tbo.ttm->pages, + num_pages, 0, + num_pages << PAGE_SHIFT, + max_segment, GFP_KERNEL); + if (err) { + kfree(sgt); + return ERR_PTR(err); + } + if (dma_map_sgtable(attach->dev, sgt, dir, DMA_ATTR_SKIP_CPU_SYNC)) { + sg_free_table(sgt); + kfree(sgt); + return ERR_PTR(-EBUSY); + } + break; + } case TTM_PL_VRAM: r = amdgpu_vram_mgr_alloc_sgt(adev, bo->tbo.resource, 0, bo->tbo.base.size, attach->dev, @@ -195,11 
+218,6 @@ static struct sg_table *amdgpu_dma_buf_map(struct dma_buf_attachment *attach, } return sgt; - -error_free: - sg_free_table(sgt); - kfree(sgt); - return ERR_PTR(-EBUSY); } /** -- 2.35.1
Re: [PATCH] drm/amdgpu: Limit the maximum fragment to granularity size
On 2024-01-26 10:35, Christian König wrote: Am 26.01.24 um 16:17 schrieb Philip Yang: On 2024-01-26 09:59, Christian König wrote: Am 26.01.24 um 15:38 schrieb Philip Yang: svm range support partial migration and mapping update, for size 4MB virtual address 4MB alignment and physical address continuous range, if mapping to GPU with fs=10, after updating mapping of the first 2MB, if the second 2MB mapping fs=10 in cache TLB, this causes the first 2MB access to the stale mapping. Well that sounds fishy. When that happens with (for example) 4MiB and 2MiB, why doesn't it happen with 8KiB and 4KiB as well? unmap svm range is aligned to granularity size, if the range size is 8KB (all within one 2MB granularity range), it will be mapped/unmapped as 8KB, even if only 4KB is migrated. This is handled in another patch series "amd/amdkfd: Unmap range from GPU based on granularity". Ok that makes a bit more sense. But when you have a linear 4MiB mapping and unmap the first 2MiB of it you need to flush the TLB anyway. So why would that cause a stale access? yes, unmap does flush the TLB, the issue happens if GPU access the second 2MB to load fs=10 entry to TLB, and then access the first 2MB. Originally I thought this could be fixed by using granularity aligned address, size to map/unmap to GPU, after debugging, realize we still need limit the max fragment size. We could change this in svm map function, but it is more efficient to pass the max fragment size to GPU page table update level. Regards, Philip Regards, Christian. Regards, Philip Christian. Limit the maximum fragment size to granularity size, 2MB by default, with the mapping and unmapping based on gramularity size, to solve this issue. The change is only for SVM map/unmap range, no change for gfx and legacy API path. 
Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 12 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 22 -- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 + 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ed4a8c5d26d7..a2bef94cb959 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -897,6 +897,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, * @res: ttm_resource to map * @pages_addr: DMA addresses to use for mapping * @fence: optional resulting fence + * @frag_size: max map fragment size * * Fill in the page table entries between @start and @last. * @@ -908,7 +909,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct dma_resv *resv, uint64_t start, uint64_t last, uint64_t flags, uint64_t offset, uint64_t vram_base, struct ttm_resource *res, dma_addr_t *pages_addr, - struct dma_fence **fence) + struct dma_fence **fence, unsigned int frag_size) { struct amdgpu_vm_update_params params; struct amdgpu_vm_tlb_seq_struct *tlb_cb; @@ -1016,7 +1017,7 @@ int amdgpu_vm_update_range(struct amdgpu_de
Re: [PATCH] drm/amdgpu: Limit the maximum fragment to granularity size
On 2024-01-26 09:59, Christian König wrote: Am 26.01.24 um 15:38 schrieb Philip Yang: svm range support partial migration and mapping update, for size 4MB virtual address 4MB alignment and physical address continuous range, if mapping to GPU with fs=10, after updating mapping of the first 2MB, if the second 2MB mapping fs=10 in cache TLB, this causes the first 2MB access to the stale mapping. Well that sounds fishy. When that happens with (for example) 4MiB and 2MiB, why doesn't it happen with 8KiB and 4KiB as well? unmap svm range is aligned to granularity size, if the range size is 8KB (all within one 2MB granularity range), it will be mapped/unmapped as 8KB, even if only 4KB is migrated. This is handled in another patch series "amd/amdkfd: Unmap range from GPU based on granularity". Regards, Philip Christian. Limit the maximum fragment size to granularity size, 2MB by default, with the mapping and unmapping based on gramularity size, to solve this issue. The change is only for SVM map/unmap range, no change for gfx and legacy API path. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 12 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 22 -- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 + 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ed4a8c5d26d7..a2bef94cb959 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -897,6 +897,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, * @res: ttm_resource to map * @pages_addr: DMA addresses to use for mapping * @fence: optional resulting fence + * @frag_size: max map fragment size * * Fill in the page table entries between @start and @last. 
* @@ -908,7 +909,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct dma_resv *resv, uint64_t start, uint64_t last, uint64_t flags, uint64_t offset, uint64_t vram_base, struct ttm_resource *res, dma_addr_t *pages_addr, - struct dma_fence **fence) + struct dma_fence **fence, unsigned int frag_size) { struct amdgpu_vm_update_params params; struct amdgpu_vm_tlb_seq_struct *tlb_cb; @@ -1016,7 +1017,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, } tmp = start + num_entries; - r = amdgpu_vm_ptes_update(, start, tmp, addr, flags); + r = amdgpu_vm_ptes_update(, start, tmp, addr, flags, frag_size); if (r) goto error_free; @@ -1197,7 +1198,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, !uncached, resv, mapping->start, mapping->last, update_flags, mapping->offset, vram_base, mem, pages_addr, - last_update); + last_update, 0); if (r) return r; } @@ -1392,7 +1393,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, r = amdgpu_vm_update_range(adev, vm, false, false, true, false, resv, mapping->start, mapping->last, init_pte_value, 0, 0, NULL, NULL, - ); + , 0); amdgpu_vm_free_map
[PATCH] drm/amdgpu: Limit the maximum fragment to granularity size
svm range support partial migration and mapping update, for size 4MB virtual address 4MB alignment and physical address continuous range, if mapping to GPU with fs=10, after updating mapping of the first 2MB, if the second 2MB mapping fs=10 in cache TLB, this causes the first 2MB access to the stale mapping. Limit the maximum fragment size to granularity size, 2MB by default, with the mapping and unmapping based on gramularity size, to solve this issue. The change is only for SVM map/unmap range, no change for gfx and legacy API path. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 12 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h| 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 22 -- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 + 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ed4a8c5d26d7..a2bef94cb959 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -897,6 +897,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, * @res: ttm_resource to map * @pages_addr: DMA addresses to use for mapping * @fence: optional resulting fence + * @frag_size: max map fragment size * * Fill in the page table entries between @start and @last. 
* @@ -908,7 +909,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct dma_resv *resv, uint64_t start, uint64_t last, uint64_t flags, uint64_t offset, uint64_t vram_base, struct ttm_resource *res, dma_addr_t *pages_addr, - struct dma_fence **fence) + struct dma_fence **fence, unsigned int frag_size) { struct amdgpu_vm_update_params params; struct amdgpu_vm_tlb_seq_struct *tlb_cb; @@ -1016,7 +1017,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, } tmp = start + num_entries; - r = amdgpu_vm_ptes_update(, start, tmp, addr, flags); + r = amdgpu_vm_ptes_update(, start, tmp, addr, flags, frag_size); if (r) goto error_free; @@ -1197,7 +1198,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, !uncached, resv, mapping->start, mapping->last, update_flags, mapping->offset, vram_base, mem, pages_addr, - last_update); + last_update, 0); if (r) return r; } @@ -1392,7 +1393,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, r = amdgpu_vm_update_range(adev, vm, false, false, true, false, resv, mapping->start, mapping->last, init_pte_value, 0, 0, NULL, NULL, - ); + , 0); amdgpu_vm_free_mapping(adev, vm, mapping, f); if (r) { dma_fence_put(f); @@ -2733,7 +2734,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, } r = amdgpu_vm_update_range(adev, vm, true, false, false, false, - NULL, addr, addr, flags, value, 0, NULL, NULL, NULL); + NULL, addr, addr, flags, value, 0, NULL, NULL, + NULL, 0); if (r) goto error_unlock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 98a57192..b34466b5086f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -465,7 +465,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct dma_resv *resv, uint64_t start, uint64_t last, uint64_t flags, uint64_t offset, uint64_t vram_base, struct ttm_resource *res, 
dma_addr_t *pages_addr, - struct dma_fence **fence); + struct dma_fence **fence, unsigned int frag_size); int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, bool clear); @@ -531,7 +531,7 @@ int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, struct amdgpu_vm_bo_base *entry); int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, uint64_t start, uint64_t end, -
[PATCH v4 7/7] drm/amdkfd: Wait update sdma fence before tlb flush
If using sdma update GPU page table, kfd flush tlb does nothing if vm update fence callback doesn't update vm->tlb_seq. This works now because retry fault will come and update page table again and flush tlb finally. With the bitmap_map flag, the retry fault recover will only update GPU page table once, have to wait sdma udate fence and then flush tlb. No change if using CPU update GPU page table for large bar because no vm update fence. Remove wait parameter in svm_range_validate_and_map because it is always called with true now. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index b36d997e7a3d..9e5f6e12c498 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1677,7 +1677,7 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, static int svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, unsigned long npages, bool readonly, - unsigned long *bitmap, bool wait, bool flush_tlb) + unsigned long *bitmap, bool flush_tlb) { struct kfd_process_device *pdd; struct amdgpu_device *bo_adev = NULL; @@ -1710,8 +1710,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly, prange->dma_addr[gpuidx], -bo_adev, wait ? 
: NULL, -flush_tlb); +bo_adev, , flush_tlb); if (r) break; @@ -1837,7 +1836,7 @@ static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) static int svm_range_validate_and_map(struct mm_struct *mm, unsigned long map_start, unsigned long map_last, struct svm_range *prange, int32_t gpuidx, - bool intr, bool wait, bool flush_tlb) + bool intr, bool flush_tlb) { struct svm_validate_context *ctx; unsigned long start, end, addr; @@ -1950,7 +1949,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, offset = map_start_vma - prange->start; npages = map_last_vma - map_start_vma + 1; r = svm_range_map_to_gpus(prange, offset, npages, readonly, - ctx->bitmap, wait, flush_tlb); + ctx->bitmap, flush_tlb); } } @@ -2041,7 +2040,7 @@ static void svm_range_restore_work(struct work_struct *work) mutex_lock(>migrate_mutex); r = svm_range_validate_and_map(mm, prange->start, prange->last, prange, - MAX_GPU_INSTANCE, false, true, false); + MAX_GPU_INSTANCE, false, false); if (r) pr_debug("failed %d to map 0x%lx to gpus\n", r, prange->start); @@ -3303,7 +3302,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, mmap_read_lock(mm); r = svm_range_validate_and_map(mm, start, last, prange, gpuidx, false, - false, false); + false); if (r) pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", r, svms, start, last); @@ -3847,7 +3846,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, flush_tlb = !migrated && update_mapping && svm_range_partial_mapped(prange, prange->start, prange->last); r = svm_range_validate_and_map(mm, prange->start, prange->last, prange, - MAX_GPU_INSTANCE, true, true, flush_tlb); + MAX_GPU_INSTANCE, true, flush_tlb); if (r) pr_debug("failed %d to map svm range\n", r); @@ -3863,7 +3862,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, mutex_lock(>migrate_mutex); flush_tlb = svm_range_partial_mapped(prange, prange->start, prange->last); r = svm_range_validate_and_map(mm, 
prange->start, prange->last, prange, - MAX_GPU_INSTANCE, true,
[PATCH v4 2/7] drm/amdkfd: Add helper function align range start last
Calculate range start, last address aligned to the range granularity size. This removes the duplicate code, and the helper function will be used in the future patch to handle map, unmap to GPU based on range granularity. No functional change. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 10 -- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 10 ++ 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index dae05f70257b..64eb9023d66b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -986,8 +986,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) /* Align migration range start and size to granularity size */ size = 1UL << prange->granularity; - start = max(ALIGN_DOWN(addr, size), prange->start); - last = min(ALIGN(addr + 1, size) - 1, prange->last); + start = svm_range_align_start(addr, prange->start, size); + last = svm_range_align_last(addr, prange->last, size); r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm, start, last, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, vmf->page); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 14dbc0fd51a9..a2c96f5760ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2698,10 +2698,8 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack); - start_limit = max(vma->vm_start >> PAGE_SHIFT, - (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); - end_limit = min(vma->vm_end >> PAGE_SHIFT, - (unsigned long)ALIGN(addr + 1, 2UL << 8)); + start_limit = svm_range_align_start(addr, vma->vm_start >> PAGE_SHIFT, 2UL << 8); + end_limit = svm_range_align_last(addr, (vma->vm_end >> PAGE_SHIFT) - 1, 2UL << 8) + 1; /* First range that starts after the 
fault address */ node = interval_tree_iter_first(>svms.objects, addr + 1, ULONG_MAX); if (node) { @@ -3043,8 +3041,8 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, /* Align migration range start and size to granularity size */ size = 1UL << prange->granularity; - start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start); - last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last); + start = svm_range_align_start(addr, prange->start, size); + last = svm_range_align_last(addr, prange->last, size); if (prange->actual_loc != 0 || best_loc != 0) { migration = true; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index 026863a0abcd..806bcac6d101 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -159,6 +159,16 @@ static inline struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo) return svm_bo; } +static inline u64 svm_range_align_start(u64 addr, u64 range_start, u64 align_size) +{ + return max(ALIGN_DOWN(addr, align_size), range_start); +} + +static inline u64 svm_range_align_last(u64 addr, u64 range_last, u64 align_size) +{ + return min(ALIGN(addr + 1, align_size) - 1, range_last); +} + int svm_range_list_init(struct kfd_process *p); void svm_range_list_fini(struct kfd_process *p); int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, -- 2.35.1
[PATCH v4 5/7] drm/amdkfd: Change range granularity update bitmap_map
When changing the svm range granularity, update the svm range bitmap_map based on new range granularity. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 49 +++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 7a30c3e58234..ebc4cce801bf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -757,6 +757,53 @@ svm_range_check_attr(struct kfd_process *p, return 0; } +static void +svm_range_change_granularity(struct svm_range *prange, u8 value) +{ + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); + u32 new_nbits, old_nbits, i, n; + unsigned long *new_bits, *old_bits; + u32 gpuidx; + + if (prange->granularity == value) + return; + + old_nbits = svm_range_map_nbits(prange->start, prange->last, prange->granularity); + new_nbits = svm_range_map_nbits(prange->start, prange->last, value); + if (new_nbits > old_nbits) { + n = new_nbits / old_nbits; + if (new_nbits % old_nbits) + n++; + } else { + n = old_nbits / new_nbits; + if (old_nbits % new_nbits) + n++; + } + + pr_debug("prange 0x%p [0x%lx 0x%lx] bitmap_map nbits %d -> %d\n", +prange, prange->start, prange->last, old_nbits, new_nbits); + + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) { + old_bits = prange->bitmap_map[gpuidx]; + if (bitmap_empty(old_bits, old_nbits)) + continue; + + new_bits = bitmap_zalloc(new_nbits, GFP_KERNEL); + if (!new_bits) + return; + + for_each_set_bit(i, old_bits, old_nbits) { + if (new_nbits > old_nbits) + bitmap_set(new_bits, i * n, n); + else + bitmap_set(new_bits, i / n, 1); + } + prange->bitmap_map[gpuidx] = new_bits; + bitmap_free(old_bits); + } + prange->granularity = value; +} + static void svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, @@ -801,7 +848,7 @@ svm_range_apply_attrs(struct kfd_process *p, struct 
svm_range *prange, prange->flags &= ~attrs[i].value; break; case KFD_IOCTL_SVM_ATTR_GRANULARITY: - prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F); + svm_range_change_granularity(prange, min_t(u32, attrs[i].value, 0x3F)); break; default: WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); -- 2.35.1
[PATCH v4 4/7] amd/amdkfd: Unmap range from GPU based on granularity
When MMU notifier invalidate the range, align the start and last address to range granularity to unmap from GPU and update bitmap_map flag. Skip unmap from GPU if range is already unmapped based on bitmap_map flag. This avoids unmap 1 page from GPU and flush TLB, also solve the rocgdb CWSR migration related issue. Unmap the range from cpu will remove the range and split the range, this cannot align the start and last address to range granularity. Change to split the range and bitmap_map flag first, then unmap the range from GPU. If unmapping from GPU first, the bitmap_map flag is updated, split range may get incorrect bitmap_map for the remaining ranges. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 42 +++- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index a003406db067..7a30c3e58234 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2114,6 +2114,13 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm, } else { unsigned long s, l; uint32_t trigger; + u64 size = 1UL << prange->granularity; + + if (!svm_range_partial_mapped(prange, start, last)) { + pr_debug("svms 0x%p [0x%lx 0x%lx] unmapped already\n", +prange->svms, start, last); + return 0; + } if (event == MMU_NOTIFY_MIGRATE) trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE; @@ -2122,16 +2129,17 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm, pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n", prange->svms, start, last); + list_for_each_entry(pchild, >child_list, child_list) { mutex_lock_nested(>lock, 1); - s = max(start, pchild->start); - l = min(last, pchild->last); + s = svm_range_align_start(start, pchild->start, size); + l = svm_range_align_last(last, pchild->last, size); if (l >= s) svm_range_unmap_from_gpus(pchild, s, l, trigger); mutex_unlock(>lock); } - s = max(start, prange->start); - l = 
min(last, prange->last); + s = svm_range_align_start(start, prange->start, size); + l = svm_range_align_last(last, prange->last, size); if (l >= s) svm_range_unmap_from_gpus(prange, s, l, trigger); } @@ -2645,24 +2653,32 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, list_for_each_entry(pchild, >child_list, child_list) { mutex_lock_nested(>lock, 1); - s = max(start, pchild->start); - l = min(last, pchild->last); - if (l >= s) - svm_range_unmap_from_gpus(pchild, s, l, trigger); svm_range_unmap_split(mm, prange, pchild, start, last); mutex_unlock(>lock); } - s = max(start, prange->start); - l = min(last, prange->last); - if (l >= s) - svm_range_unmap_from_gpus(prange, s, l, trigger); svm_range_unmap_split(mm, prange, prange, start, last); - if (unmap_parent) svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE); else svm_range_add_list_work(svms, prange, mm, SVM_OP_UPDATE_RANGE_NOTIFIER); + + list_for_each_entry(pchild, >child_list, child_list) { + if (pchild->work_item.op != SVM_OP_UNMAP_RANGE) + continue; + + s = max(start, pchild->start); + l = min(last, pchild->last); + if (l >= s) + svm_range_unmap_from_gpus(pchild, s, l, trigger); + } + if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { + s = max(start, prange->start); + l = min(last, prange->last); + if (l >= s) + svm_range_unmap_from_gpus(prange, s, l, trigger); + } + schedule_deferred_list_work(svms); kfd_unref_process(p); -- 2.35.1
[PATCH v4 6/7] drm/amdkfd: Check bitmap_map flag to skip retry fault
Remove prange validate_timestamp which is not accurate for multiple GPUs. Use the bitmap_map flag to skip the retry fault from different pages of the same granularity range if the granularity range is already mapped on the specific GPU. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 - 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index ebc4cce801bf..b36d997e7a3d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -45,10 +45,6 @@ #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 -/* Long enough to ensure no retry fault comes after svm range is restored and - * page table is updated. - */ -#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC) #if IS_ENABLED(CONFIG_DYNAMIC_DEBUG) #define dynamic_svm_range_dump(svms) \ _dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms) @@ -380,7 +376,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, INIT_LIST_HEAD(>deferred_list); INIT_LIST_HEAD(>child_list); atomic_set(>invalid, 0); - prange->validate_timestamp = 0; mutex_init(>migrate_mutex); mutex_init(>lock); @@ -1965,8 +1960,6 @@ static int svm_range_validate_and_map(struct mm_struct *mm, } svm_range_unreserve_bos(ctx); - if (!r) - prange->validate_timestamp = ktime_get_boottime(); free_ctx: kfree(ctx); @@ -3226,15 +3219,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out_unlock_mm; } - /* skip duplicate vm fault on different pages of same range */ - if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp, - AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) { - pr_debug("svms 0x%p [0x%lx %lx] already restored\n", -svms, prange->start, prange->last); - r = 0; - goto out_unlock_mm; - } - /* __do_munmap removed VMA, return success as we are handling stale * retry 
fault. */ @@ -3260,6 +3244,14 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, goto out_unlock_mm; } + /* skip duplicate vm fault on different pages of same granularity range */ + if (svm_range_partial_mapped_dev(gpuidx, prange, addr, addr)) { + pr_debug("svms 0x%p [0x%lx %lx] addr 0x%llx already mapped on gpu %d\n", +svms, prange->start, prange->last, addr, gpuidx); + r = 0; + goto out_unlock_mm; + } + mutex_lock(>migrate_mutex); pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n", diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index a10eeb77f83e..5a9688d5c18c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -129,7 +129,6 @@ struct svm_range { uint32_tactual_loc; uint8_t granularity; atomic_tinvalid; - ktime_t validate_timestamp; struct mmu_interval_notifiernotifier; struct svm_work_list_item work_item; struct list_headdeferred_list; -- 2.35.1
[PATCH v4 3/7] drm/amdkfd: Add granularity size based bitmap map flag
Replace prange->mapped_to_gpu with prange->bitmap_map[], which is per GPU flag and use bitmap bits based on prange granularity. Align map to GPU or unmap from GPU range size to granularity size and update the corresponding bitmap_map flag bits. This will optimize multiple GPU map, unmap and retry fault recover. svm_range_partial_mapped is false only if no part of the range mapping on any GPUs. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 258 ++- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 7 +- 2 files changed, 219 insertions(+), 46 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index a2c96f5760ff..a003406db067 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -307,12 +307,12 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); } - /* free dma_addr array for each gpu */ + /* free dma_addr array, bitmap_map for each gpu */ for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { - if (prange->dma_addr[gpuidx]) { + if (prange->dma_addr[gpuidx]) kvfree(prange->dma_addr[gpuidx]); - prange->dma_addr[gpuidx] = NULL; - } + if (prange->bitmap_map[gpuidx]) + bitmap_free(prange->bitmap_map[gpuidx]); } mutex_destroy(>lock); @@ -338,19 +338,38 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, uint64_t size = last - start + 1; struct svm_range *prange; struct kfd_process *p; - - prange = kzalloc(sizeof(*prange), GFP_KERNEL); - if (!prange) - return NULL; + unsigned int nbits; + u32 gpuidx; p = container_of(svms, struct kfd_process, svms); if (!p->xnack_enabled && update_mem_usage && amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) { pr_info("SVM mapping failed, exceeds resident system memory limit\n"); - kfree(prange); return NULL; } + + prange = kzalloc(sizeof(*prange), GFP_KERNEL); + if (!prange) + return NULL; + + 
svm_range_set_default_attributes(>preferred_loc, +>prefetch_loc, +>granularity, >flags); + + nbits = svm_range_map_nbits(start, last, prange->granularity); + pr_debug("prange 0x%p [0x%llx 0x%llx] bitmap_map nbits %d\n", prange, +start, last, nbits); + for_each_set_bit(gpuidx, p->svms.bitmap_supported, p->n_pdds) { + prange->bitmap_map[gpuidx] = bitmap_zalloc(nbits, GFP_KERNEL); + if (!prange->bitmap_map[gpuidx]) { + while (gpuidx--) + bitmap_free(prange->bitmap_map[gpuidx]); + kfree(prange); + return NULL; + } + } + prange->npages = size; prange->svms = svms; prange->start = start; @@ -369,10 +388,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, bitmap_copy(prange->bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE); - svm_range_set_default_attributes(>preferred_loc, ->prefetch_loc, ->granularity, >flags); - pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last); return prange; @@ -1017,6 +1032,51 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old, return 0; } +static int +svm_range_split_bitmap_map(struct svm_range *new, struct svm_range *old, + u64 start, u64 last) +{ + struct kfd_process *p = container_of(new->svms, struct kfd_process, svms); + u32 new_nbits, old_nbits, old_nbits2; + unsigned long *bits; + u32 gpuidx; + + new_nbits = svm_range_map_nbits(new->start, new->last, new->granularity); + old_nbits = svm_range_map_nbits(old->start, old->last, old->granularity); + old_nbits2 = svm_range_map_nbits(start, last, old->granularity); + + pr_debug("old 0x%p [0x%lx 0x%lx] => [0x%llx 0x%llx] nbits %d => %d\n", +old, old->start, old->last, start, last, old_nbits, old_nbits2); + pr_debug("new 0x%p [0x%lx 0x%lx] nbits %d\n", new, new->start, new->last, +new_nbits); + + for_each_set_bit(gpuidx, p-&g
[PATCH v4 1/7] drm/amdkfd: Add helper function svm_range_need_access_gpus
Add the helper function to get all GPUs bitmap that need access the svm range. This helper will be used in the following patch to check if prange is mapped to all gpus. Refactor svm_range_validate_and_map to use the helper function, no functional change. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 74 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 18f8c82a849c..14dbc0fd51a9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1169,6 +1169,44 @@ svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, list_add_tail(>child_list, >child_list); } +static int +svm_range_need_access_gpus(unsigned long *bitmap, struct svm_range *prange) +{ + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); + u32 gpuidx; + + if (p->xnack_enabled) { + bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); + + /* If prefetch range to GPU, or GPU retry fault migrate range to +* GPU, which has ACCESS attribute to the range, create mapping +* on that GPU. 
+*/ + if (prange->actual_loc) { + gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc); + if (gpuidx < 0) + return -EINVAL; + + if (test_bit(gpuidx, prange->bitmap_access)) + bitmap_set(bitmap, gpuidx, 1); + } + + /* +* If prange is already mapped or with always mapped flag, +* update mapping on GPUs with ACCESS attribute +*/ + if (bitmap_empty(bitmap, MAX_GPU_INSTANCE)) { + if (prange->mapped_to_gpu || + prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED) + bitmap_copy(bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); + } + } else { + bitmap_or(bitmap, prange->bitmap_access, + prange->bitmap_aip, MAX_GPU_INSTANCE); + } + return 0; +} + static bool svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b) { @@ -1609,38 +1647,12 @@ static int svm_range_validate_and_map(struct mm_struct *mm, if (gpuidx < MAX_GPU_INSTANCE) { bitmap_zero(ctx->bitmap, MAX_GPU_INSTANCE); bitmap_set(ctx->bitmap, gpuidx, 1); - } else if (ctx->process->xnack_enabled) { - bitmap_copy(ctx->bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE); - - /* If prefetch range to GPU, or GPU retry fault migrate range to -* GPU, which has ACCESS attribute to the range, create mapping -* on that GPU. 
-*/ - if (prange->actual_loc) { - gpuidx = kfd_process_gpuidx_from_gpuid(ctx->process, - prange->actual_loc); - if (gpuidx < 0) { - WARN_ONCE(1, "failed get device by id 0x%x\n", -prange->actual_loc); - r = -EINVAL; - goto free_ctx; - } - if (test_bit(gpuidx, prange->bitmap_access)) - bitmap_set(ctx->bitmap, gpuidx, 1); - } - - /* -* If prange is already mapped or with always mapped flag, -* update mapping on GPUs with ACCESS attribute -*/ - if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { - if (prange->mapped_to_gpu || - prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED) - bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE); - } } else { - bitmap_or(ctx->bitmap, prange->bitmap_access, - prange->bitmap_aip, MAX_GPU_INSTANCE); + r = svm_range_need_access_gpus(ctx->bitmap, prange); + if (r) { + WARN_ONCE(1, "failed get device by id 0x%x\n", prange->actual_loc); + goto free_ctx; + } } if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) { -- 2.35.1
[PATCH v4] drm/amdkfd: Set correct svm range actual loc after spliting
While svm range partial migrating to system memory, clear dma_addr vram domain flag, otherwise the future split will get incorrect vram_pages and actual loc. After range spliting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. new range takes svm_bo ref only if vram_pages not equal to 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 8 + drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 42 ++-- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 1 + 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index bdc01ca9609a..79baa195ccac 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -564,6 +564,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, dma_addr_t *scratch, uint64_t npages) { struct device *dev = adev->dev; + dma_addr_t *dma_addr; uint64_t *src; dma_addr_t *dst; struct page *dpage; @@ -575,6 +576,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, prange->last); addr = migrate->start; + dma_addr = svm_get_dma_addr_for_page_count(prange, addr); src = (uint64_t *)(scratch + npages); dst = scratch; @@ -623,6 +625,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, goto out_oom; } + /* Clear VRAM flag when page is migrated to ram, to count vram +* pages correctly when spliting the range. 
+*/ + if (dma_addr && (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN)) + dma_addr[i] = 0; + pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n", dst[i] >> PAGE_SHIFT, page_to_pfn(dpage)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index f84547eccd28..78b4968e4c95 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -362,7 +362,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, INIT_LIST_HEAD(>child_list); atomic_set(>invalid, 0); prange->validate_timestamp = 0; - prange->vram_pages = 0; mutex_init(>migrate_mutex); mutex_init(>lock); @@ -965,6 +964,24 @@ svm_range_split_array(void *ppnew, void *ppold, size_t size, return 0; } +dma_addr_t * +svm_get_dma_addr_for_page_count(struct svm_range *prange, u64 addr) +{ + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); + dma_addr_t *dma_addr; + s32 gpuidx; + + gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc); + if (gpuidx < 0) { + pr_debug("no GPU id 0x%x found\n", prange->actual_loc); + return NULL; + } + + dma_addr = prange->dma_addr[gpuidx]; + dma_addr += (addr >> PAGE_SHIFT) - prange->start; + return dma_addr; +} + static int svm_range_split_pages(struct svm_range *new, struct svm_range *old, uint64_t start, uint64_t last) @@ -980,9 +997,14 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old, if (r) return r; } - if (old->actual_loc) + if (old->actual_loc && new->vram_pages) { old->vram_pages -= new->vram_pages; - + new->actual_loc = old->actual_loc; + if (!old->vram_pages) + old->actual_loc = 0; + } + pr_debug("new->vram_pages 0x%llx loc 0x%x old->vram_pages 0x%llx loc 0x%x\n", +new->vram_pages, new->actual_loc, old->vram_pages, old->actual_loc); return 0; } @@ -1002,13 +1024,14 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old, new->offset = old->offset + npages; } - new->svm_bo = svm_range_bo_ref(old->svm_bo); - new->ttm_res = 
old->ttm_res; - - spin_lock(&new->svm_bo->list_lock); - list_add(&new->svm_bo_list, &new->svm_bo->range_list); - spin_unlock(&new->svm_bo->list_lock); + if (new->vram_pages) { + new->svm_bo = svm_range_bo_ref(old->svm_bo); + new->ttm_res = old->ttm_res; + spin_lock(&new->svm_bo->list_lock); + list_add(&new->svm_bo_list, &new->svm_bo->range_list); + spin_unlock(&new->svm_bo->list_lock); + } return 0;
[PATCH] drm/amdkfd: Correct partial migration virtual addr
Partial migration to system memory should use migrate.addr, not prange->start as virtual address to allocate system memory page. Fixes: 18eb61bd5a6a ("drm/amdkfd: Use partial migrations/mapping for GPU/CPU page faults in SVM") Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f856901055d3..bdc01ca9609a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -574,7 +574,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, prange->last); - addr = prange->start << PAGE_SHIFT; + addr = migrate->start; src = (uint64_t *)(scratch + npages); dst = scratch; -- 2.35.1
Re: [PATCH v3] amd/amdkfd: Set correct svm range actual loc after spliting
On 2024-01-11 12:37, Chen, Xiaogang wrote: On 1/11/2024 10:54 AM, Felix Kuehling wrote: On 2024-01-10 17:01, Philip Yang wrote: While svm range partial migrating to system memory, clear dma_addr vram domain flag, otherwise the future split will get incorrect vram_pages and actual loc. After range spliting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. new range takes svm_bo ref only if vram_pages not equal to 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 20 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 ++-- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f856901055d3..dae05f70257b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -563,18 +563,30 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, dma_addr_t *scratch, uint64_t npages) { + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); struct device *dev = adev->dev; + dma_addr_t *dma_addr; uint64_t *src; dma_addr_t *dst; struct page *dpage; uint64_t i = 0, j; uint64_t addr; + s32 gpuidx; + u64 offset; int r = 0; pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, prange->last); - addr = prange->start << PAGE_SHIFT; Is this another bug fix for partial migration? If so, it may be worth making that a separate patch. Seems it is also a bug when prange is across multiple vma. With partial migration it become obvious. 
yes + gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc); + if (gpuidx < 0) { + pr_debug("no GPU id 0x%x found\n", prange->actual_loc); + return -EINVAL; + } + + addr = migrate->start; + offset = (addr >> PAGE_SHIFT) - prange->start; + dma_addr = prange->dma_addr[gpuidx]; src = "" *)(scratch + npages); dst = scratch; @@ -623,6 +635,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, goto out_oom; } + /* Clear VRAM flag when page is migrated to ram, to count vram + * pages correctly when spliting the range. + */ + if (dma_addr && (dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN)) + dma_addr[offset + i] = 0; + When come here we already know the page has been moved to system ram, do we still need check dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN) You want to set dma_addr[offset + i] = 0 anyway. I agree, dma_addr NULL and flag check is for safe purpose in case we may change dma_addr update after migration or prefetch later. I'm not a big fan of messing with the DMA arrays here, but I don't have a good alternative. I think what bothers me is, how the DMA address array and handling of vram page count is now spread out across so many places. It feels fragile.
Re: [PATCH v3] amd/amdkfd: Set correct svm range actual loc after spliting
On 2024-01-11 11:54, Felix Kuehling wrote: On 2024-01-10 17:01, Philip Yang wrote: While svm range partial migrating to system memory, clear dma_addr vram domain flag, otherwise the future split will get incorrect vram_pages and actual loc. After range spliting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. new range takes svm_bo ref only if vram_pages not equal to 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 20 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 ++-- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f856901055d3..dae05f70257b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -563,18 +563,30 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, dma_addr_t *scratch, uint64_t npages) { + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); struct device *dev = adev->dev; + dma_addr_t *dma_addr; uint64_t *src; dma_addr_t *dst; struct page *dpage; uint64_t i = 0, j; uint64_t addr; + s32 gpuidx; + u64 offset; int r = 0; pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, prange->last); - addr = prange->start << PAGE_SHIFT; Is this another bug fix for partial migration? If so, it may be worth making that a separate patch. yes, it is another bug I just noticed, the addr is passed to alloc system page along with migrate.vma, but addr is ignored for normal path, only used for shmem path, maybe it doesn't matter, I will put this into a separate patch anyway. 
+ gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc); + if (gpuidx < 0) { + pr_debug("no GPU id 0x%x found\n", prange->actual_loc); + return -EINVAL; + } + + addr = migrate->start; + offset = (addr >> PAGE_SHIFT) - prange->start; + dma_addr = prange->dma_addr[gpuidx]; src = "" *)(scratch + npages); dst = scratch; @@ -623,6 +635,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, goto out_oom; } + /* Clear VRAM flag when page is migrated to ram, to count vram + * pages correctly when spliting the range. + */ + if (dma_addr && (dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN)) + dma_addr[offset + i] = 0; + I'm not a big fan of messing with the DMA arrays here, but I don't have a good alternative. I think what bothers me is, how the DMA address array and handling of vram page count is now spread out across so many places. It feels fragile. Maybe it would be good to add a helper in kfd_svm.c: svm_get_dma_addr_for_page_count(prange, offset). That way you can keep the choice of gpuid and offset calculation in one place in kfd_svm.c, close to svm_range_copy_array. vram page counting is only used when spliting range, it is good idea to add helper and put close to svm range split and copy array, not put in the migration path. Regards, Philip Other than that, the patch looks good to me. Regards, Felix pr_debug_ratelimited("dma mapping dst to
[PATCH v3] amd/amdkfd: Set correct svm range actual loc after spliting
While svm range partial migrating to system memory, clear dma_addr vram domain flag, otherwise the future split will get incorrect vram_pages and actual loc. After range spliting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. new range takes svm_bo ref only if vram_pages not equal to 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 20 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 24 ++-- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f856901055d3..dae05f70257b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -563,18 +563,30 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, dma_addr_t *scratch, uint64_t npages) { + struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); struct device *dev = adev->dev; + dma_addr_t *dma_addr; uint64_t *src; dma_addr_t *dst; struct page *dpage; uint64_t i = 0, j; uint64_t addr; + s32 gpuidx; + u64 offset; int r = 0; pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, prange->last); - addr = prange->start << PAGE_SHIFT; + gpuidx = kfd_process_gpuidx_from_gpuid(p, prange->actual_loc); + if (gpuidx < 0) { + pr_debug("no GPU id 0x%x found\n", prange->actual_loc); + return -EINVAL; + } + + addr = migrate->start; + offset = (addr >> PAGE_SHIFT) - prange->start; + dma_addr = prange->dma_addr[gpuidx]; src = (uint64_t *)(scratch + npages); dst = scratch; @@ -623,6 +635,12 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange, goto out_oom; } + /* Clear VRAM flag when page is migrated to ram, to count vram +* pages correctly when spliting the range. 
+*/ + if (dma_addr && (dma_addr[offset + i] & SVM_RANGE_VRAM_DOMAIN)) + dma_addr[offset + i] = 0; + pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n", dst[i] >> PAGE_SHIFT, page_to_pfn(dpage)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index cc24f30f88fb..35ee9e648cca 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -362,7 +362,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, INIT_LIST_HEAD(>child_list); atomic_set(>invalid, 0); prange->validate_timestamp = 0; - prange->vram_pages = 0; mutex_init(>migrate_mutex); mutex_init(>lock); @@ -980,9 +979,14 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old, if (r) return r; } - if (old->actual_loc) + if (old->actual_loc && new->vram_pages) { old->vram_pages -= new->vram_pages; - + new->actual_loc = old->actual_loc; + if (!old->vram_pages) + old->actual_loc = 0; + } + pr_debug("new->vram_pages 0x%llx loc 0x%x old->vram_pages 0x%llx loc 0x%x\n", +new->vram_pages, new->actual_loc, old->vram_pages, old->actual_loc); return 0; } @@ -1002,13 +1006,14 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old, new->offset = old->offset + npages; } - new->svm_bo = svm_range_bo_ref(old->svm_bo); - new->ttm_res = old->ttm_res; - - spin_lock(>svm_bo->list_lock); - list_add(>svm_bo_list, >svm_bo->range_list); - spin_unlock(>svm_bo->list_lock); + if (new->vram_pages) { + new->svm_bo = svm_range_bo_ref(old->svm_bo); + new->ttm_res = old->ttm_res; + spin_lock(>svm_bo->list_lock); + list_add(>svm_bo_list, >svm_bo->range_list); + spin_unlock(>svm_bo->list_lock); + } return 0; } @@ -1058,7 +1063,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old, new->flags = old->flags; new->preferred_loc = old->preferred_loc; new->prefetch_loc = old->prefetch_loc; - new->actual_loc = old->actual_loc; new->granularity = old->granularity; new->mapped_to_gpu = 
old->mapped_to_gpu; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE); -- 2.35.1
Re: [PATCH v2] amd/amdkfd: Set correct svm range actual loc after spliting
On 2024-01-10 11:30, Felix Kuehling wrote: On 2024-01-09 15:05, Philip Yang wrote: After svm range partial migrating to system memory, unmap to cleanup the corresponding dma_addr vram domain flag, otherwise the future split will get incorrect vram_pages and actual loc. After range spliting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. new range takes svm_bo ref only if vram_pages not equal to 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 3 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 +- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f856901055d3..e85bcda29db6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -839,6 +839,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, prange->actual_loc = 0; svm_range_vram_node_free(prange); } + + svm_range_dma_unmap(prange, start_mgr - prange->start, + last_mgr - start_mgr + 1); If this is just for clearing the VRAM flags, then we should probably create another helper function for that. DMA unmapping system memory pages that didn't even move is not necessary here. Also, as Xiaogang pointed out, the migration may have missed some pages due to page locking race conditions. If you want this to give you accurate VRAM page counts, you should only clear the VRAM flags for pages that were actually migrated. ok, understand the concern now, if failed to migrate page to system memory to recover CPU page fault, app will crash, but prefetch may fail to migrate page to system memory, will send new patch, to clear the prange->dma_addr[gpuidx] VRAM flags while migrating the range to ram. Regards, Philip Regards, Felix } return r < 0 ? 
r : 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index cc24f30f88fb..2202bdcde057 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -254,6 +254,10 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, return; for (i = offset; i < offset + npages; i++) { + if (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN) { + dma_addr[i] = 0; + continue; + } if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) continue; pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); @@ -262,7 +266,8 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, } } -void svm_range_dma_unmap(struct svm_range *prange) +void svm_range_dma_unmap(struct svm_range *prange, unsigned long offset, + unsigned long npages) { struct kfd_process_device *pdd; dma_addr_t *dma_addr; @@ -284,7 +289,7 @@ void svm_range_dma_unmap(struct svm_range *prange) } dev = >dev->adev->pdev->dev; - svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages); + svm_range_dma_unmap_dev(dev, dma_addr, offset, npages); } } @@ -299,7 +304,7 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) svm_range_vram_node_free(prange); if (do_unmap) - svm_range_dma_unmap(prang
Re: [PATCH v2] amd/amdkfd: Set correct svm range actual loc after splitting
On 2024-01-09 17:29, Chen, Xiaogang wrote: On 1/9/2024 2:05 PM, Philip Yang wrote: After svm range partial migrating to system memory, unmap to cleanup the corresponding dma_addr vram domain flag, otherwise the future split will get incorrect vram_pages and actual loc. After range spliting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. new range takes svm_bo ref only if vram_pages not equal to 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 3 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 +- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f856901055d3..e85bcda29db6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -839,6 +839,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, prange->actual_loc = 0; svm_range_vram_node_free(prange); } + + svm_range_dma_unmap(prange, start_mgr - prange->start, + last_mgr - start_mgr + 1); when come here we know some pages got migrated to sys ram, in theory we do not know if all pages got migrated. svm_range_dma_unmap does dma_unmap for all pages from start_mgr - prange->start to last_mgr - start_mgr + 1. If there are pages not migrated due to some reason(though it is rare) we still need keep its dma_addr, I think only hmm can tell that. For system page dma unmap_page and set dma_addr=0 after migration is fine because before updating GPU mapping, svm_range_validate_and_map calls svm_range_dma_map to update dma_addr for system pages. } return r < 0 ? 
r : 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index cc24f30f88fb..2202bdcde057 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -254,6 +254,10 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, return; for (i = offset; i < offset + npages; i++) { + if (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN) { + dma_addr[i] = 0; + continue; + } same as above here set dma_addr[i]=0 unconditionally without knowing if the page is indeed in sys ram. dma_addr[i] & SVM_RANGE_VRAM_DOMAIN is for device page, system page will still call dma_unmap_page. if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) continue; pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); @@ -262,7 +266,8 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, } } -void svm_range_dma_unmap(struct svm_range *prange) +void svm_range_dma_unmap(struct svm_range *prange, unsigned long offset, + unsigned long npages) { struct kfd_process_device *pdd; dma_addr_t *dma_addr; @@ -284,7 +289,7 @@ void svm_range_dma_unmap(struct svm_range *prange) } dev = >dev->adev->pdev->dev; - svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages); + svm_range_dma_unmap_dev(dev, dma_addr, offset, npages); } } @@ -299,7 +304,7 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) svm_range_vram_node_free(prange); if
[PATCH v2] amd/amdkfd: Set correct svm range actual loc after splitting
After svm range partial migrating to system memory, unmap to cleanup the corresponding dma_addr vram domain flag, otherwise the future split will get incorrect vram_pages and actual loc. After range spliting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. new range takes svm_bo ref only if vram_pages not equal to 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 3 ++ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++- drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 +- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index f856901055d3..e85bcda29db6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -839,6 +839,9 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, prange->actual_loc = 0; svm_range_vram_node_free(prange); } + + svm_range_dma_unmap(prange, start_mgr - prange->start, + last_mgr - start_mgr + 1); } return r < 0 ? 
r : 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index cc24f30f88fb..2202bdcde057 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -254,6 +254,10 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, return; for (i = offset; i < offset + npages; i++) { + if (dma_addr[i] & SVM_RANGE_VRAM_DOMAIN) { + dma_addr[i] = 0; + continue; + } if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i])) continue; pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT); @@ -262,7 +266,8 @@ void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, } } -void svm_range_dma_unmap(struct svm_range *prange) +void svm_range_dma_unmap(struct svm_range *prange, unsigned long offset, +unsigned long npages) { struct kfd_process_device *pdd; dma_addr_t *dma_addr; @@ -284,7 +289,7 @@ void svm_range_dma_unmap(struct svm_range *prange) } dev = &pdd->dev->adev->pdev->dev; - svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages); + svm_range_dma_unmap_dev(dev, dma_addr, offset, npages); } } @@ -299,7 +304,7 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) svm_range_vram_node_free(prange); if (do_unmap) - svm_range_dma_unmap(prange); + svm_range_dma_unmap(prange, 0, prange->npages); if (do_unmap && !p->xnack_enabled) { pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size); @@ -362,7 +367,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, INIT_LIST_HEAD(&prange->child_list); atomic_set(&prange->invalid, 0); prange->validate_timestamp = 0; - prange->vram_pages = 0; mutex_init(&prange->migrate_mutex); mutex_init(&prange->lock); @@ -980,9 +984,14 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old, if (r) return r; } - if (old->actual_loc) + if (old->actual_loc && new->vram_pages) { old->vram_pages -= new->vram_pages; - + new->actual_loc = old->actual_loc; + if (!old->vram_pages) + old->actual_loc = 0; + } + pr_debug("new->vram_pages 
0x%llx loc 0x%x old->vram_pages 0x%llx loc 0x%x\n", +new->vram_pages, new->actual_loc, old->vram_pages, old->actual_loc); return 0; } @@ -1002,13 +1011,14 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old, new->offset = old->offset + npages; } - new->svm_bo = svm_range_bo_ref(old->svm_bo); - new->ttm_res = old->ttm_res; - - spin_lock(&new->svm_bo->list_lock); - list_add(&new->svm_bo_list, &old->svm_bo->range_list); - spin_unlock(&new->svm_bo->list_lock); + if (new->vram_pages) { + new->svm_bo = svm_range_bo_ref(old->svm_bo); + new->ttm_res = old->ttm_res; + spin_lock(&new->svm_bo->list_lock); + list_add(&new->svm_bo_list, &old->svm_bo->range_list); + spin_unlock(&new->svm_bo->list_lock); + } return 0; } @@ -1058,7 +1068,6 @@ svm_
Re: [PATCH] amd/amdkfd: Set correct svm range actual loc after splitting
On 2024-01-08 18:17, Chen, Xiaogang wrote: With a nitpick below, this patch is Reviewed-by: Xiaogang Chen On 1/8/2024 4:36 PM, Philip Yang wrote: After range splitting, set new range and old range actual_loc: new range actual_loc is 0 if new->vram_pages is 0. old range actual_loc is 0 if old->vram_pages - new->vram_pages == 0. Signed-off-by: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index cc24f30f88fb..cb09e1d3a643 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -362,7 +362,6 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, INIT_LIST_HEAD(&prange->child_list); atomic_set(&prange->invalid, 0); prange->validate_timestamp = 0; - prange->vram_pages = 0; I think it is better to keep it, also: +new->actual_loc = 0; though not necessary as prange is allocated by kzalloc, just keep consistent with previous statements, or remove atomic_set(&prange->invalid, 0); prange->validate_timestamp = 0; too. kzalloc memset prange to 0, we should remove unnecessary 0 assignment. prange->validate_timestamp will be removed completely in the following patch. Will send out v2 patch to fix other related issues. 
Regards, Philip Regards Xiaogang mutex_init(&prange->migrate_mutex); mutex_init(&prange->lock); @@ -980,8 +979,12 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old, if (r) return r; } - if (old->actual_loc) + if (old->actual_loc && new->vram_pages) { old->vram_pages -= new->vram_pages; + new->actual_loc = old->actual_loc; + if (!old->vram_pages) + old->actual_loc = 0; + } return 0; } @@ -1058,7 +1061,6 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old, new->flags = old->flags; new->preferred_loc = old->preferred_loc; new->prefetch_loc = old->prefetch_loc; - new->actual_loc = old->actual_loc; new->granularity = old->granularity; new->mapped_to_gpu = old->mapped_to_gpu; bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);