Entities' gart windows are contiguous, so when copying a buffer, if src
doesn't need a gart window its window can be used to extend the dst
window (and vice versa).
This doubles the gart window size and reduces the number of jobs
required.

---
v2: pass adev instead of ring to amdgpu_ttm_needs_gart_window
v4: - merge if conditions
    - always call amdgpu_ttm_needs_gart_window before calling
      amdgpu_ttm_map_buffer
---

Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 83 +++++++++++++++++--------
 1 file changed, 57 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 6da1ce03ce15..ad53789d0e2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -177,6 +177,21 @@ amdgpu_ttm_job_submit(struct amdgpu_device *adev, struct amdgpu_ttm_buffer_entit
 	return amdgpu_job_submit(job);
 }
 
+static bool amdgpu_ttm_needs_gart_window(struct amdgpu_device *adev,
+					 struct ttm_resource *mem,
+					 struct amdgpu_res_cursor *mm_cur,
+					 bool tmz,
+					 uint64_t *addr)
+{
+	/* Map only what can't be accessed directly */
+	if (!tmz && mem->start != AMDGPU_BO_INVALID_OFFSET) {
+		*addr = amdgpu_ttm_domain_start(adev, mem->mem_type) +
+			mm_cur->start;
+		return false;
+	}
+	return true;
+}
+
 /**
  * amdgpu_ttm_map_buffer - Map memory into the GART windows
  * @entity: entity to run the window setup job
@@ -184,18 +199,22 @@ amdgpu_ttm_job_submit(struct amdgpu_device *adev, struct amdgpu_ttm_buffer_entit
  * @mem: memory object to map
  * @mm_cur: range to map
  * @window: which GART window to use
+ * @use_two_windows: if true, use a double window
  * @tmz: if we should setup a TMZ enabled mapping
  * @size: in number of bytes to map, out number of bytes mapped
  * @addr: resulting address inside the MC address space
  *
  * Setup one of the GART windows to access a specific piece of memory or return
  * the physical address for local memory.
+ * amdgpu_ttm_needs_gart_window() should be used to check if calling this
+ * function is required.
  */
 static int amdgpu_ttm_map_buffer(struct amdgpu_ttm_buffer_entity *entity,
 				 struct ttm_buffer_object *bo,
 				 struct ttm_resource *mem,
 				 struct amdgpu_res_cursor *mm_cur,
 				 unsigned int window,
+				 bool use_two_windows,
 				 bool tmz, uint64_t *size, uint64_t *addr)
 {
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
@@ -212,14 +231,6 @@ static int amdgpu_ttm_map_buffer(struct amdgpu_ttm_buffer_entity *entity,
 	if (WARN_ON(mem->mem_type == AMDGPU_PL_PREEMPT))
 		return -EINVAL;
 
-	/* Map only what can't be accessed directly */
-	if (!tmz && mem->start != AMDGPU_BO_INVALID_OFFSET) {
-		*addr = amdgpu_ttm_domain_start(adev, mem->mem_type) +
-			mm_cur->start;
-		return 0;
-	}
-
 	/*
 	 * If start begins at an offset inside the page, then adjust the size
 	 * and addr accordingly
@@ -227,7 +238,8 @@ static int amdgpu_ttm_map_buffer(struct amdgpu_ttm_buffer_entity *entity,
 	offset = mm_cur->start & ~PAGE_MASK;
 	num_pages = PFN_UP(*size + offset);
-	num_pages = min_t(uint32_t, num_pages, AMDGPU_GTT_MAX_TRANSFER_SIZE);
+	num_pages = min_t(uint32_t,
+			  num_pages, AMDGPU_GTT_MAX_TRANSFER_SIZE * (use_two_windows ? 2 : 1));
 
 	*size = min(*size, (uint64_t)num_pages * PAGE_SIZE - offset);
 
@@ -299,6 +311,7 @@ static int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
 				       struct dma_resv *resv,
 				       struct dma_fence **f)
 {
+	bool src_needs_gart_window, dst_needs_gart_window;
 	struct amdgpu_res_cursor src_mm, dst_mm;
 	struct dma_fence *fence = NULL;
 	int r = 0;
@@ -323,16 +336,29 @@ static int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
 		/* Never copy more than 256MiB at once to avoid a timeout */
 		cur_size = min3(src_mm.size, dst_mm.size, 256ULL << 20);
 
-		/* Map src to window 0 and dst to window 1. */
-		r = amdgpu_ttm_map_buffer(entity, src->bo, src->mem, &src_mm,
-					  0, tmz, &cur_size, &from);
-		if (r)
-			goto error;
+		/* If only one direction needs a gart window to access memory, use both
+		 * windows for it.
+		 */
+		src_needs_gart_window =
+			amdgpu_ttm_needs_gart_window(adev, src->mem, &src_mm, tmz, &from);
+		dst_needs_gart_window =
+			amdgpu_ttm_needs_gart_window(adev, dst->mem, &dst_mm, tmz, &to);
 
-		r = amdgpu_ttm_map_buffer(entity, dst->bo, dst->mem, &dst_mm,
-					  1, tmz, &cur_size, &to);
-		if (r)
-			goto error;
+		if (src_needs_gart_window) {
+			r = amdgpu_ttm_map_buffer(entity, src->bo, src->mem, &src_mm,
+						  0, !dst_needs_gart_window,
+						  tmz, &cur_size, &from);
+			if (r)
+				goto error;
+		}
+		if (dst_needs_gart_window) {
+			r = amdgpu_ttm_map_buffer(entity, dst->bo, dst->mem, &dst_mm,
+						  src_needs_gart_window ? 1 : 0,
+						  !src_needs_gart_window,
+						  tmz, &cur_size, &to);
+			if (r)
+				goto error;
+		}
 
 		abo_src = ttm_to_amdgpu_bo(src->bo);
 		abo_dst = ttm_to_amdgpu_bo(dst->bo);
@@ -2426,10 +2452,12 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
 		/* Never clear more than 256MiB at once to avoid timeouts */
 		size = min(cursor.size, 256ULL << 20);
 
-		r = amdgpu_ttm_map_buffer(entity, &bo->tbo, bo->tbo.resource, &cursor,
-					  1, false, &size, &addr);
-		if (r)
-			goto err;
+		if (amdgpu_ttm_needs_gart_window(adev, bo->tbo.resource, &cursor, false, &addr)) {
+			r = amdgpu_ttm_map_buffer(entity, &bo->tbo, bo->tbo.resource, &cursor,
+						  1, false, false, &size, &addr);
+			if (r)
+				goto err;
+		}
 
 		r = amdgpu_ttm_fill_mem(adev, entity, 0, addr, size, resv,
 					&next, true,
@@ -2476,10 +2504,13 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_buffer_entity *entity,
 		/* Never fill more than 256MiB at once to avoid timeouts */
 		cur_size = min(dst.size, 256ULL << 20);
 
-		r = amdgpu_ttm_map_buffer(entity, &bo->tbo, bo->tbo.resource, &dst,
-					  1, false, &cur_size, &to);
-		if (r)
-			goto error;
+		if (amdgpu_ttm_needs_gart_window(adev, bo->tbo.resource, &dst, false, &to)) {
+			r = amdgpu_ttm_map_buffer(entity, &bo->tbo, bo->tbo.resource, &dst,
+						  1, false, false,
+						  &cur_size, &to);
+			if (r)
+				goto error;
+		}
 
 		r = amdgpu_ttm_fill_mem(adev, entity, src_data, to, cur_size, resv,
-- 
2.43.0
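
Note for readers skimming the diff: the window assignment introduced above
can be summarised with a small standalone sketch. It is illustrative only;
needs_window, WINDOW_PAGES, struct side and the value 8 are invented for the
example (they are not the driver's names nor the real value of
AMDGPU_GTT_MAX_TRANSFER_SIZE). The actual logic lives in
amdgpu_ttm_copy_mem_to_mem() and amdgpu_ttm_map_buffer().

/*
 * Illustrative sketch of the window assignment; not driver code.
 */
#include <stdbool.h>
#include <stdio.h>

#define WINDOW_PAGES 8	/* placeholder size of one GART window, in pages */

struct side {
	const char *name;
	bool needs_window;	/* does this side need a GART mapping at all? */
	int window;		/* first window index assigned, -1 if unmapped */
	int window_pages;	/* pages the assigned window(s) can cover */
};

/* Windows 0 and 1 are contiguous, so a side that is alone in needing a
 * mapping starts at window 0 and may span both windows. */
static void assign_windows(struct side *src, struct side *dst)
{
	if (src->needs_window) {
		src->window = 0;
		src->window_pages = WINDOW_PAGES * (dst->needs_window ? 1 : 2);
	}
	if (dst->needs_window) {
		dst->window = src->needs_window ? 1 : 0;
		dst->window_pages = WINDOW_PAGES * (src->needs_window ? 1 : 2);
	}
}

int main(void)
{
	/* Example: only dst needs a mapping, so it gets a double window. */
	struct side src = { "src", false, -1, 0 };
	struct side dst = { "dst", true,  -1, 0 };

	assign_windows(&src, &dst);
	printf("dst -> window %d, %d pages\n", dst.window, dst.window_pages);
	return 0;
}

When both sides need a mapping the behaviour matches the code before this
patch: src uses window 0, dst uses window 1, each of single size.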
