On 12/8/25 16:52, Philip Yang wrote:
> On 2025-12-08 03:40, Christian König wrote:
>> On 12/5/25 22:49, Philip Yang wrote:
>>> MQD BO in VRAM is accessed via the FB aperture with mtype UC (uncached).
>>> Map it to GART with mtype RW (cached) to reduce queue switch latency.
>>>
>>> Add helpers amdgpu_ttm_alloc/free_gart_entries.
>>> Add helper amdgpu_ttm_gart_bind_gfx9_mqd_vram to bind VRAM pages to
>>> the GART mapping.
>>>
>>> Add a GART drm_mm node to the kfd mem obj so the GART entries can be
>>> freed after the MQD is freed.
>>>
>>> Signed-off-by: Philip Yang <[email protected]>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 103 ++++++++++++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 8 ++
>>> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 1 +
>>> .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 9 ++
>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
>>> 5 files changed, 122 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 4f8bc7f35cdc..fc6f4daa9b87 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -880,6 +880,42 @@ static void amdgpu_ttm_gart_bind_gfx9_mqd(struct amdgpu_device *adev,
>>> }
>>> }
>>> +static void amdgpu_ttm_gart_bind_gfx9_mqd_vram(struct amdgpu_device *adev,
>>> + struct ttm_buffer_object *tbo,
>>> + struct drm_mm_node *mm_node,
>>> + uint64_t flags)
>>> +{
>>> + uint64_t total_pages;
>>> + int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp);
>>> + uint64_t page_idx, pages_per_xcc;
>>> + uint64_t ctrl_flags = flags;
>>> + int i;
>>> +
>>> + total_pages = tbo->resource->size >> PAGE_SHIFT;
>>> +
>>> + amdgpu_gmc_get_vm_pte(adev, NULL, NULL, AMDGPU_VM_MTYPE_NC, &ctrl_flags);
>>> +
>>> + if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 3))
>>> + amdgpu_gmc_get_vm_pte(adev, NULL, NULL, AMDGPU_VM_MTYPE_RW, &flags);
>>> +
>>> + pages_per_xcc = total_pages;
>>> + do_div(pages_per_xcc, num_xcc);
>>> +
>>> + for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += pages_per_xcc) {
>>> + u64 pa = (tbo->resource->start + page_idx) << PAGE_SHIFT;
>>> + u64 start_page = mm_node->start + page_idx;
>> Don't use resource->start and mm_node->start directly. Use the resource
>> iterators for that.
> The VRAM resource is allocated with AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS in the
> previous patch, so it is a single block, and the GART entries allocated from
> drm_mm_insert_node_in_range are always a single block. The MQD size is 32
> pages for MI300 and 6 pages for VG10, so contiguous allocation is fine unless
> VRAM is too fragmented. Alternatively, I can remove the
> AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS flag in this patch and use the resource
> iterators to update the GART mapping.
It doesn't matter whether the block is contiguous or not, the point is that you
should not touch resource->start nor mm_node->start directly.
Those are a deprecated TTM field and internals of the VRAM manager backend;
neither should be touched here.
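
For example, something along these lines (a rough, untested sketch using the
amdgpu_res_cursor helpers and the amdgpu_gart_map_vram_range call from the
earlier patch in this series; gart_offset stands for whatever GART page the
GTT manager handed out, and the per-XCC MTYPE handling is left out) would
avoid poking at resource->start:

    struct amdgpu_res_cursor cursor;
    u64 dst = gart_offset; /* hypothetical: first GART page to fill */

    /* walk the VRAM blocks of the BO instead of assuming one block */
    amdgpu_res_first(tbo->resource, 0, tbo->resource->size, &cursor);
    while (cursor.remaining) {
            u64 pa = cursor.start + adev->vm_manager.vram_base_offset;
            u64 num_pages = cursor.size >> PAGE_SHIFT;

            amdgpu_gart_map_vram_range(adev, pa, dst, num_pages, flags, NULL);

            dst += num_pages;
            amdgpu_res_next(&cursor, cursor.size);
    }
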
Regards,
Christian.
>>
>>> +
>>> + pa += adev->vm_manager.vram_base_offset;
>>> + amdgpu_gart_map_vram_range(adev, pa, start_page, 1,
>>> + flags, NULL);
>>> +
>>> + amdgpu_gart_map_vram_range(adev, pa + PAGE_SIZE,
>>> + start_page + 1,
>>> + pages_per_xcc - 1,
>>> + ctrl_flags, NULL);
>>> + }
>>> +}
>>> +
>>> static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev,
>>> struct ttm_buffer_object *tbo,
>>> uint64_t flags)
>>> @@ -1017,6 +1053,73 @@ int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo)
>>> return 0;
>>> }
>>> +int amdgpu_ttm_alloc_gart_entries(struct amdgpu_device *adev,
>>> + struct drm_mm_node *mm_node,
>>> + u64 num_pages)
>>> +{
>>> + struct ttm_resource_manager *man;
>>> + struct amdgpu_gtt_mgr *mgr;
>>> + int r;
>>> +
>>> + man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>> + mgr = container_of(man, struct amdgpu_gtt_mgr, manager);
>>> +
>>> + spin_lock(&mgr->lock);
>>> + r = drm_mm_insert_node_in_range(&mgr->mm, mm_node, num_pages,
>>> + 0, 0, 0,
>>> + adev->gmc.gart_size >> PAGE_SHIFT,
>>> + DRM_MM_INSERT_BEST);
>> That belongs into amdgpu_gtt_mgr.c and clearly not here!
> Yes, I will move the helper function to amdgpu_gtt_mgr.c
>
> Regards,
> Philip
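
Sounds good. Just as a rough sketch of what I had in mind for
amdgpu_gtt_mgr.c, so the manager's drm_mm and lock stay private to that file
(the function name is of course up to you):

    int amdgpu_gtt_mgr_alloc_gart_entries(struct amdgpu_device *adev,
                                          struct drm_mm_node *node,
                                          u64 num_pages)
    {
            struct ttm_resource_manager *man =
                    ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
            struct amdgpu_gtt_mgr *mgr =
                    container_of(man, struct amdgpu_gtt_mgr, manager);
            int r;

            /* keep the GTT manager's drm_mm and lock internal to this file */
            spin_lock(&mgr->lock);
            r = drm_mm_insert_node_in_range(&mgr->mm, node, num_pages, 0, 0, 0,
                                            adev->gmc.gart_size >> PAGE_SHIFT,
                                            DRM_MM_INSERT_BEST);
            spin_unlock(&mgr->lock);
            return r;
    }
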
>>
>> Regards,
>> Christian.
>>
>>> + spin_unlock(&mgr->lock);
>>> + return r;
>>> +}
>>> +
>>> +void amdgpu_ttm_free_gart_entries(struct amdgpu_device *adev,
>>> + struct drm_mm_node *mm_node)
>>> +{
>>> + struct ttm_resource_manager *man;
>>> + struct amdgpu_gtt_mgr *mgr;
>>> +
>>> + man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>> + mgr = container_of(man, struct amdgpu_gtt_mgr, manager);
>>> +
>>> + spin_lock(&mgr->lock);
>>> + if (drm_mm_node_allocated(mm_node))
>>> + drm_mm_remove_node(mm_node);
>>> + spin_unlock(&mgr->lock);
>>> +}
>>> +
>>> +/*
>>> + * amdgpu_ttm_alloc_gart_vram_bo - Bind VRAM pages to GART mapping
>>> + *
>>> + * call amdgpu_ttm_alloc_gart_entries to alloc GART dynamically
>>> + */
>>> +int amdgpu_ttm_alloc_gart_vram_bo(struct amdgpu_bo *abo,
>>> + struct drm_mm_node *mm_node,
>>> + u64 *gpu_addr)
>>> +{
>>> + struct ttm_buffer_object *bo = &abo->tbo;
>>> + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
>>> + uint64_t flags;
>>> + int r;
>>> +
>>> + /* Only for valid VRAM bo resource */
>>> + if (bo->resource->start == AMDGPU_BO_INVALID_OFFSET)
>>> + return 0;
>>> +
>>> + r = amdgpu_ttm_alloc_gart_entries(adev, mm_node,
>>> + amdgpu_bo_ngpu_pages(abo));
>>> + if (r)
>>> + return r;
>>> +
>>> + /* compute PTE flags for this buffer object */
>>> + flags = amdgpu_ttm_tt_pte_flags(adev, NULL, bo->resource);
>>> + amdgpu_ttm_gart_bind_gfx9_mqd_vram(adev, bo, mm_node, flags);
>>> + amdgpu_gart_invalidate_tlb(adev);
>>> +
>>> + *gpu_addr = mm_node->start << PAGE_SHIFT;
>>> + return 0;
>>> +}
>>> +
>>> /*
>>> * amdgpu_ttm_recover_gart - Rebind GTT pages
>>> *
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index 72488124aa59..cb6123358843 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -185,6 +185,14 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_buffer_entity *entity,
>>> u64 k_job_id);
>>> int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo);
>>> +int amdgpu_ttm_alloc_gart_vram_bo(struct amdgpu_bo *abo,
>>> + struct drm_mm_node *mm_node,
>>> + u64 *gpu_addr);
>>> +int amdgpu_ttm_alloc_gart_entries(struct amdgpu_device *adev,
>>> + struct drm_mm_node *mm_node,
>>> + u64 num_pages);
>>> +void amdgpu_ttm_free_gart_entries(struct amdgpu_device *adev,
>>> + struct drm_mm_node *mm_node);
>>> void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);
>>> uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, uint32_t type);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
>>> index f78b249e1a41..00e1e5b30a3a 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
>>> @@ -225,6 +225,7 @@ void kfd_free_mqd_cp(struct mqd_manager *mm, void *mqd,
>>> struct kfd_mem_obj *mqd_mem_obj)
>>> {
>>> if (mqd_mem_obj->mem) {
>>> + amdgpu_ttm_free_gart_entries(mm->dev->adev, &mqd_mem_obj->mm_node);
>>> amdgpu_amdkfd_free_kernel_mem(mm->dev->adev, &mqd_mem_obj->mem);
>>> kfree(mqd_mem_obj);
>>> } else {
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> index 14123e1a9716..5828220056bd 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
>>> @@ -148,6 +148,15 @@ static struct kfd_mem_obj *allocate_mqd(struct kfd_node *node,
>>> kfree(mqd_mem_obj);
>>> return NULL;
>>> }
>>> +
>>> + retval = amdgpu_ttm_alloc_gart_vram_bo(mqd_mem_obj->mem,
>>> + &mqd_mem_obj->mm_node,
>>> + &(mqd_mem_obj->gpu_addr));
>>> + if (retval) {
>>> + amdgpu_amdkfd_free_kernel_mem(node->adev, &(mqd_mem_obj->mem));
>>> + kfree(mqd_mem_obj);
>>> + return NULL;
>>> + }
>>> } else {
>>> retval = kfd_gtt_sa_allocate(node, sizeof(struct v9_mqd),
>>> &mqd_mem_obj);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 29419b3249cf..fdde907836fb 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -252,6 +252,7 @@ struct kfd_mem_obj {
>>> uint64_t gpu_addr;
>>> uint32_t *cpu_ptr;
>>> void *mem;
>>> + struct drm_mm_node mm_node;
>>> };
>>> struct kfd_vmid_info {
>