On 6/22/26 16:23, Wentao Liang wrote:
> Fix two issues in amdgpu_gfx_run_cleaner_shader_job():
> 
> 1. IB buffer overflow: The indirect buffer is hardcoded to 64 bytes,
>    but the initialization loop writes up to (align_mask + 1) dwords.
>    On modern GFX rings with align_mask = 0xff, this writes 1024 bytes,
>    overflowing the 64-byte allocation and corrupting memory.
> 
> 2. Scheduler entity leak: The drm_sched_entity is not cleaned up on
>    the error path after amdgpu_job_alloc_with_ib() fails.
> 
> Fix by:
> - Dynamically calculating IB size based on ring->funcs->align_mask
> - Adding drm_sched_entity_destroy() to the error path
> 
> Cc: [email protected]
> Fixes: d361ad5d2fc0 ("drm/amdgpu: Add sysfs interface for running cleaner 
> shader")
> Fixes: 256576ed6895 ("drm/amdgpu: give each kernel job a unique id")
> Fixes: 559a285816af ("drm/amdgpu: Replace 'amdgpu_job_submit_direct' with 
> 'drm_sched_entity' in cleaner shader")
> Signed-off-by: Wentao Liang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 16 ++++++++++++----
>  1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index b8ca876694ff..b50ec1a5c645 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1651,6 +1651,7 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct 
> amdgpu_ring *ring)
>       struct amdgpu_job *job;
>       struct amdgpu_ib *ib;
>       void *owner;
> +     unsigned int ib_size;
>       int i, r;
>  
>       /* Initialize the scheduler entity */
> @@ -1658,7 +1659,7 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct 
> amdgpu_ring *ring)
>                                 &sched, 1, NULL);
>       if (r) {
>               dev_err(adev->dev, "Failed setting up GFX kernel entity.\n");
> -             goto err;
> +             return r;
>       }
>  
>       /*
> @@ -1668,8 +1669,15 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct 
> amdgpu_ring *ring)
>        */
>       owner = (void *)(unsigned long)atomic_inc_return(&counter);
>  
> +     /*
> +      * Allocate IB with enough space for align_mask + 1 dwords.
> +      * The initialization loop below writes exactly this many dwords.
> +      * Each dword is 4 bytes.
> +      */
> +     ib_size = (ring->funcs->align_mask + 1) * sizeof(uint32_t);
> +

Please completely drop that.

The ring->funcs->align_mask is the ring alignment mask and has no technical 
relevance for the IB.

>       r = amdgpu_job_alloc_with_ib(ring->adev, &entity, owner,
> -                                  64, 0, &job,
> +                                  ib_size, 0, &job,
>                                    AMDGPU_KERNEL_JOB_ID_CLEANER_SHADER);
>       if (r)
>               goto err;
> @@ -1686,8 +1694,6 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct 
> amdgpu_ring *ring)
>       f = amdgpu_job_submit(job);
>  
>       r = dma_fence_wait(f, false);

Please drop assigning r as well.

Regards,
Christian.

> -     if (r)
> -             goto err;
>  
>       dma_fence_put(f);
>  
> @@ -1696,6 +1702,8 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct 
> amdgpu_ring *ring)
>       return 0;
>  
>  err:
> +     /* Clean up the scheduler entity */
> +     drm_sched_entity_destroy(&entity);
>       return r;
>  }
>  

Reply via email to