On 11/21/25 11:12, Pierre-Eric Pelloux-Prayer wrote:
> If multiple entities share the same window we must make sure
> that jobs using them are executed sequentially.
> 
> This commit gives separate windows to each entity, so jobs
> from multiple entities could execute in parallel if needed.
> (for now they all use the first sdma engine, so it makes no
> difference yet).
> The entity stores the gart window offsets to centralize the
> "window id" to "window offset" conversion in a single place.
> 
> default_entity doesn't get any windows reserved since there is
> no use for them.
> 
> ---
> v3:
> - renamed gart_window_lock -> lock (Christian)
> - added amdgpu_ttm_buffer_entity_init (Christian)
> - fixed gart_addr in svm_migrate_gart_map (Felix)
> - renamed gart_window_idX -> gart_window_offs[]
> - added amdgpu_compute_gart_address
> ---
> 
> Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  6 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 56 ++++++++++++++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  | 14 +++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  9 ++--
>  4 files changed, 61 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 94e07b9ec7b4..0d2784fe0be3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -686,7 +686,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, 
> uint32_t vmid,
>        * translation. Avoid this by doing the invalidation from the SDMA
>        * itself at least for GART.
>        */
> -     mutex_lock(&adev->mman.gtt_window_lock);
> +     mutex_lock(&adev->mman.default_entity.lock);
>       r = amdgpu_job_alloc_with_ib(ring->adev, 
> &adev->mman.default_entity.base,
>                                    AMDGPU_FENCE_OWNER_UNDEFINED,
>                                    16 * 4, AMDGPU_IB_POOL_IMMEDIATE,
> @@ -699,7 +699,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, 
> uint32_t vmid,
>       job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;
>       amdgpu_ring_pad_ib(ring, &job->ibs[0]);
>       fence = amdgpu_job_submit(job);
> -     mutex_unlock(&adev->mman.gtt_window_lock);
> +     mutex_unlock(&adev->mman.default_entity.lock);
>  
>       dma_fence_wait(fence, false);
>       dma_fence_put(fence);
> @@ -707,7 +707,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, 
> uint32_t vmid,
>       return;
>  
>  error_alloc:
> -     mutex_unlock(&adev->mman.gtt_window_lock);
> +     mutex_unlock(&adev->mman.default_entity.lock);
>       dev_err(adev->dev, "Error flushing GPU TLB using the SDMA (%d)!\n", r);
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 57dff2df433b..1371a40d4687 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -229,9 +229,7 @@ static int amdgpu_ttm_map_buffer(struct amdgpu_device 
> *adev,
>  
>       *size = min(*size, (uint64_t)num_pages * PAGE_SIZE - offset);
>  
> -     *addr = adev->gmc.gart_start;
> -     *addr += (u64)window * AMDGPU_GTT_MAX_TRANSFER_SIZE *
> -             AMDGPU_GPU_PAGE_SIZE;
> +     *addr = amdgpu_compute_gart_address(&adev->gmc, entity, window);
>       *addr += offset;
>  
>       num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
> @@ -249,7 +247,7 @@ static int amdgpu_ttm_map_buffer(struct amdgpu_device 
> *adev,
>       src_addr += job->ibs[0].gpu_addr;
>  
>       dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
> -     dst_addr += window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 8;
> +     dst_addr += (entity->gart_window_offs[window] >> AMDGPU_GPU_PAGE_SHIFT) 
> * 8;
>       amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
>                               dst_addr, num_bytes, 0);
>  
> @@ -314,7 +312,7 @@ static int amdgpu_ttm_copy_mem_to_mem(struct 
> amdgpu_device *adev,
>       amdgpu_res_first(src->mem, src->offset, size, &src_mm);
>       amdgpu_res_first(dst->mem, dst->offset, size, &dst_mm);
>  
> -     mutex_lock(&adev->mman.gtt_window_lock);
> +     mutex_lock(&entity->lock);
>       while (src_mm.remaining) {
>               uint64_t from, to, cur_size, tiling_flags;
>               uint32_t num_type, data_format, max_com, write_compress_disable;
> @@ -371,7 +369,7 @@ static int amdgpu_ttm_copy_mem_to_mem(struct 
> amdgpu_device *adev,
>               amdgpu_res_next(&dst_mm, cur_size);
>       }
>  error:
> -     mutex_unlock(&adev->mman.gtt_window_lock);
> +     mutex_unlock(&entity->lock);
>       *f = fence;
>       return r;
>  }
> @@ -1876,6 +1874,27 @@ static void amdgpu_ttm_mmio_remap_bo_fini(struct 
> amdgpu_device *adev)
>       adev->rmmio_remap.bo = NULL;
>  }
>  
> +static int amdgpu_ttm_buffer_entity_init(struct amdgpu_ttm_buffer_entity 
> *entity,
> +                                      int starting_gart_window,
> +                                      bool needs_src_gart_window,
> +                                      bool needs_dst_gart_window)
> +{
> +     mutex_init(&entity->lock);
> +     if (needs_src_gart_window) {
> +             entity->gart_window_offs[0] =
> +                     (u64)starting_gart_window * 
> AMDGPU_GTT_MAX_TRANSFER_SIZE *
> +                             AMDGPU_GPU_PAGE_SIZE;
> +             starting_gart_window++;
> +     }
> +     if (needs_dst_gart_window) {
> +             entity->gart_window_offs[1] =
> +                     (u64)starting_gart_window * 
> AMDGPU_GTT_MAX_TRANSFER_SIZE *
> +                             AMDGPU_GPU_PAGE_SIZE;
> +             starting_gart_window++;
> +     }
> +     return starting_gart_window;
> +}
> +
>  /*
>   * amdgpu_ttm_init - Init the memory management (ttm) as well as various
>   * gtt/vram related fields.
> @@ -1890,8 +1909,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>       uint64_t gtt_size;
>       int r;
>  
> -     mutex_init(&adev->mman.gtt_window_lock);
> -
>       dma_set_max_seg_size(adev->dev, UINT_MAX);
>       /* No others user of address space so set it to 0 */
>       r = ttm_device_init(&adev->mman.bdev, &amdgpu_bo_driver, adev->dev,
> @@ -2161,6 +2178,7 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
>  void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool 
> enable)
>  {
>       struct ttm_resource_manager *man = ttm_manager_type(&adev->mman.bdev, 
> TTM_PL_VRAM);
> +     u32 used_windows;
>       uint64_t size;
>       int r;
>  
> @@ -2204,6 +2222,14 @@ void amdgpu_ttm_set_buffer_funcs_status(struct 
> amdgpu_device *adev, bool enable)
>                       drm_sched_entity_destroy(&adev->mman.clear_entity.base);
>                       goto error_free_entity;
>               }
> +
> +             /* Statically assign GART windows to each entity. */
> +             used_windows = 
> amdgpu_ttm_buffer_entity_init(&adev->mman.default_entity,
> +                                                          0, false, false);
> +             used_windows = 
> amdgpu_ttm_buffer_entity_init(&adev->mman.move_entity,
> +                                                          used_windows, 
> true, true);
> +             used_windows = 
> amdgpu_ttm_buffer_entity_init(&adev->mman.clear_entity,
> +                                                          used_windows, 
> false, true);
>       } else {
>               drm_sched_entity_destroy(&adev->mman.default_entity.base);
>               drm_sched_entity_destroy(&adev->mman.clear_entity.base);
> @@ -2365,6 +2391,7 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_device *adev,
>                           struct dma_resv *resv,
>                           struct dma_fence **fence)
>  {
> +     struct amdgpu_ttm_buffer_entity *entity;
>       struct amdgpu_res_cursor cursor;
>       u64 addr;
>       int r = 0;
> @@ -2375,11 +2402,12 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_device 
> *adev,
>       if (!fence)
>               return -EINVAL;
>  
> +     entity = &adev->mman.clear_entity;
>       *fence = dma_fence_get_stub();
>  
>       amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
>  
> -     mutex_lock(&adev->mman.gtt_window_lock);
> +     mutex_lock(&entity->lock);
>       while (cursor.remaining) {
>               struct dma_fence *next = NULL;
>               u64 size;
> @@ -2392,13 +2420,13 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_device 
> *adev,
>               /* Never clear more than 256MiB at once to avoid timeouts */
>               size = min(cursor.size, 256ULL << 20);
>  
> -             r = amdgpu_ttm_map_buffer(adev, &adev->mman.clear_entity,
> +             r = amdgpu_ttm_map_buffer(adev, entity,
>                                         &bo->tbo, bo->tbo.resource, &cursor,
>                                         1, false, &size, &addr);
>               if (r)
>                       goto err;
>  
> -             r = amdgpu_ttm_fill_mem(adev, &adev->mman.clear_entity, 0, 
> addr, size, resv,
> +             r = amdgpu_ttm_fill_mem(adev, entity, 0, addr, size, resv,
>                                       &next, true,
>                                       AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER);
>               if (r)
> @@ -2410,7 +2438,7 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_device *adev,
>               amdgpu_res_next(&cursor, size);
>       }
>  err:
> -     mutex_unlock(&adev->mman.gtt_window_lock);
> +     mutex_unlock(&entity->lock);
>  
>       return r;
>  }
> @@ -2435,7 +2463,7 @@ int amdgpu_fill_buffer(struct amdgpu_device *adev,
>  
>       amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
>  
> -     mutex_lock(&adev->mman.gtt_window_lock);
> +     mutex_lock(&entity->lock);
>       while (dst.remaining) {
>               struct dma_fence *next;
>               uint64_t cur_size, to;
> @@ -2461,7 +2489,7 @@ int amdgpu_fill_buffer(struct amdgpu_device *adev,
>               amdgpu_res_next(&dst, cur_size);
>       }
>  error:
> -     mutex_unlock(&adev->mman.gtt_window_lock);
> +     mutex_unlock(&entity->lock);
>       if (f)
>               *f = dma_fence_get(fence);
>       dma_fence_put(fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index d0f55a7edd30..a7eed678bd3f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -29,6 +29,7 @@
>  #include <drm/ttm/ttm_placement.h>
>  #include "amdgpu_vram_mgr.h"
>  #include "amdgpu_hmm.h"
> +#include "amdgpu_gmc.h"
>  
>  #define AMDGPU_PL_GDS                (TTM_PL_PRIV + 0)
>  #define AMDGPU_PL_GWS                (TTM_PL_PRIV + 1)
> @@ -39,7 +40,7 @@
>  #define __AMDGPU_PL_NUM      (TTM_PL_PRIV + 6)
>  
>  #define AMDGPU_GTT_MAX_TRANSFER_SIZE 512
> -#define AMDGPU_GTT_NUM_TRANSFER_WINDOWS      2
> +#define AMDGPU_GTT_NUM_TRANSFER_WINDOWS      3
>  
>  extern const struct attribute_group amdgpu_vram_mgr_attr_group;
>  extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
> @@ -54,6 +55,8 @@ struct amdgpu_gtt_mgr {
>  
>  struct amdgpu_ttm_buffer_entity {
>       struct drm_sched_entity base;
> +     struct mutex            lock;
> +     u32                     gart_window_offs[2];

Please make this u64. The theoretical size of the GART is way larger than 4GiB, so a u32 cannot hold every possible window offset.

>  };
>  
>  struct amdgpu_mman {
> @@ -69,7 +72,7 @@ struct amdgpu_mman {
>  
>       struct mutex                            gtt_window_lock;
>  
> -     struct amdgpu_ttm_buffer_entity default_entity;
> +     struct amdgpu_ttm_buffer_entity default_entity; /* has no gart windows 
> */

Please put comments above the field, not after it. Best practice would be to
use kerneldoc style.

E.g. something like:

/** @default_entity: for workarounds, has no gart windows... */

>       struct amdgpu_ttm_buffer_entity clear_entity;
>       struct amdgpu_ttm_buffer_entity move_entity;
>  
> @@ -201,6 +204,13 @@ static inline int amdgpu_ttm_tt_get_user_pages(struct 
> amdgpu_bo *bo,
>  }
>  #endif
>  
> +static inline u64 amdgpu_compute_gart_address(struct amdgpu_gmc *gmc,
> +                                           struct amdgpu_ttm_buffer_entity 
> *entity,
> +                                           int index)

Kerneldoc would be nice to have for that function.

Regards,
Christian.

> +{
> +     return gmc->gart_start + entity->gart_window_offs[index];
> +}
> +
>  void amdgpu_ttm_tt_set_user_pages(struct ttm_tt *ttm, struct 
> amdgpu_hmm_range *range);
>  int amdgpu_ttm_tt_get_userptr(const struct ttm_buffer_object *tbo,
>                             uint64_t *user_addr);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index 9c76f1ba0e55..0cc1d2b35026 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -59,8 +59,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring,
>       void *cpu_addr;
>       int r;
>  
> -     /* use gart window 0 */
> -     *gart_addr = adev->gmc.gart_start;
> +     *gart_addr = amdgpu_compute_gart_address(&adev->gmc, entity, 0);
>  
>       num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
>       num_bytes = npages * 8;
> @@ -116,7 +115,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring,
>   * multiple GTT_MAX_PAGES transfer, all sdma operations are serialized, wait 
> for
>   * the last sdma finish fence which is returned to check copy memory is done.
>   *
> - * Context: Process context, takes and releases gtt_window_lock
> + * Context: Process context
>   *
>   * Return:
>   * 0 - OK, otherwise error code
> @@ -138,7 +137,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, 
> dma_addr_t *sys,
>  
>       entity = &adev->mman.move_entity;
>  
> -     mutex_lock(&adev->mman.gtt_window_lock);
> +     mutex_lock(&entity->lock);
>  
>       while (npages) {
>               size = min(GTT_MAX_PAGES, npages);
> @@ -175,7 +174,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, 
> dma_addr_t *sys,
>       }
>  
>  out_unlock:
> -     mutex_unlock(&adev->mman.gtt_window_lock);
> +     mutex_unlock(&entity->lock);
>  
>       return r;
>  }

Reply via email to