The benefit of using multiple entities is that multiple fill jobs
can run in parallel. Otherwise, even if the entity has access
to multiple engines, a burst of N independent jobs will all
run on the same engine, because an entity guarantees that the
execution order matches the submission order.
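
The scheme is a simple round-robin over the clear entities. A minimal
sketch of the selection step, using only names this patch introduces
(it mirrors the amdgpu_fill_buffer hunk below):

    /* Each partial clear picks the next entity, spreading independent
     * fills across entities (and thus engines) instead of serializing
     * them behind a single one.
     */
    e = atomic_inc_return(&adev->mman.next_clear_entity) %
            adev->mman.num_clear_entities;
    entity = &adev->mman.clear_entities[e];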

Callers can opt out of this behavior by passing the entity they
want to use (see amdgpu_move_blit), as sketched below.
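
A sketch of the two call styles (only the first argument is shown;
the remaining arguments are elided and unchanged by this patch):

    /* Ordered: a caller-supplied entity serializes all fills behind
     * it; this is what amdgpu_move_blit relies on.
     */
    r = amdgpu_fill_buffer(entity, /* ... */);

    /* Parallel: a NULL entity lets amdgpu_fill_buffer round-robin
     * over adev->mman.clear_entities and attach the fences to the
     * bo's resv object.
     */
    r = amdgpu_fill_buffer(NULL, /* ... */);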

Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 84 ++++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  1 +
 2 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c357a6d9763a..839ea8c7f6be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2224,6 +2224,7 @@ u32 amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
                adev->mman.clear_entities = kcalloc(num_clear_entities,
                                                    sizeof(struct amdgpu_ttm_entity),
                                                    GFP_KERNEL);
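+               /* Round-robin cursor used by amdgpu_fill_buffer to spread clears. */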
+               atomic_set(&adev->mman.next_clear_entity, 0);
                if (!adev->mman.clear_entities)
                        goto error_free_entity;
 
@@ -2498,10 +2499,12 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity,
 {
        struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
        struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
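+       /* Latest fence from each clear entity; added to the resv at the end. */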
+       struct dma_fence *fences[TTM_FENCES_MAX_SLOT_COUNT] = {};
        struct dma_fence *fence = NULL;
        struct dma_resv *resv = NULL;
        struct amdgpu_res_cursor dst;
-       int r;
+       uint64_t cur_size, to;
+       int r, e, n_fences;
 
        /* The fences will be either added to the resv object or the last fence
         * will be returned to the caller. In the latter case, all fill jobs will
@@ -2515,53 +2518,92 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity,
        }
 
        if (!entity) {
-               entity = &adev->mman.clear_entities[0];
                resv = &bo->tbo.base._resv;
-               r = dma_resv_reserve_fences(resv, 1);
+
+               /* Determine how many fences we're going to add to the
+                * resv object.
+                */
+               n_fences = 0;
+               amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
+               while (dst.remaining) {
+                       cur_size = min(dst.size, 256ULL << 20);
+
+                       n_fences += 1;
+                       amdgpu_res_next(&dst, cur_size);
+               }
+               if (n_fences == 0)
+                       return 0;
+
+               /* One slot per entity at most. */
+               n_fences = MIN(n_fences, adev->mman.num_clear_entities);
+
+               r = dma_resv_reserve_fences(resv, n_fences);
                if (r)
                        return r;
+       } else {
+               mutex_lock(&entity->gart_window_lock);
        }
 
        amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
 
-       mutex_lock(&entity->gart_window_lock);
        while (dst.remaining) {
-               struct dma_fence *next;
-               uint64_t cur_size, to;
-
                /* Never fill more than 256MiB at once to avoid timeouts */
                cur_size = min(dst.size, 256ULL << 20);
 
+               if (resv) {
+                       /* Pick a new entity for each partial clear so they can
+                        * execute in parallel.
+                        */
+                       e = atomic_inc_return(&adev->mman.next_clear_entity) %
+                               adev->mman.num_clear_entities;
+                       entity = &adev->mman.clear_entities[e];
+                       mutex_lock(&entity->gart_window_lock);
+               }
+
                r = amdgpu_ttm_map_buffer(&entity->base,
                                          &bo->tbo, bo->tbo.resource, &dst,
                                          entity->gart_window_id1, ring, false,
                                          &cur_size, &to,
                                          dependency,
                                          resv);
-               if (r)
+               if (r) {
+                       /* The !resv path unlocks at the error label. */
+                       if (resv)
+                               mutex_unlock(&entity->gart_window_lock);
                        goto error;
+               }
 
                r = amdgpu_ttm_fill_mem(ring, &entity->base,
                                        src_data, to, cur_size, resv,
-                                       &next, true, k_job_id);
-               if (r)
+                                       &fence, true, k_job_id);
+               if (r) {
+                       /* The !resv path unlocks at the error label. */
+                       if (resv)
+                               mutex_unlock(&entity->gart_window_lock);
                        goto error;
-
-               if (resv) {
-                       dma_resv_add_fence(resv, next, DMA_RESV_USAGE_KERNEL);
-                       dma_fence_put(next);
-               } else {
-                       dma_fence_put(fence);
-                       fence = next;
                }
 
                amdgpu_res_next(&dst, cur_size);
+
+               if (resv) {
+                       /* Delay the addition of the fences to resv, otherwise
+                        * the next partial clears will depend on this one.
+                        * Jobs on a given entity execute in order, so it is
+                        * enough to keep the most recent fence per entity;
+                        * drop any fence stored by an earlier wrap-around.
+                        */
+                       dma_fence_put(fences[e]);
+                       fences[e] = fence;
+                       mutex_unlock(&entity->gart_window_lock);
+               } else {
+                       dma_fence_put(*f);
+                       *f = fence;
+               }
        }
 error:
-       mutex_unlock(&entity->gart_window_lock);
-       if (f)
-               *f = dma_fence_get(fence);
-       dma_fence_put(fence);
+       if (resv) {
+               for (e = 0; e < adev->mman.num_clear_entities; e++) {
+                       if (fences[e]) {
+                               dma_resv_add_fence(resv, fences[e],
+                                                  DMA_RESV_USAGE_KERNEL);
+                               dma_fence_put(fences[e]);
+                       }
+               }
+       } else {
+               mutex_unlock(&entity->gart_window_lock);
+       }
+
        return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 38df2b5b4bc7..3fc31c7c6bfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -73,6 +73,7 @@ struct amdgpu_mman {
 
        struct amdgpu_ttm_entity default_entity; /* has no gart windows */
        struct amdgpu_ttm_entity *clear_entities;
+       atomic_t next_clear_entity;
        u32 num_clear_entities;
        struct amdgpu_ttm_entity move_entities[TTM_FENCES_MAX_SLOT_COUNT];
        u32 num_move_entities;
-- 
2.43.0
