On Tue, Mar 17, 2026 at 2:40 PM Ruijing Dong <[email protected]> wrote:
>
> amdgpu_device_get_job_timeout_settings() passes a pointer directly
> to the global amdgpu_lockup_timeout[] buffer into strsep().
> strsep() destructively replaces delimiter characters with '\0'
> in-place.
>
> On multi-GPU systems, this function is called once per device.
> When a multi-value setting like "0,0,0,-1" is used, the first
> GPU's call transforms the global buffer into "0\00\00\0-1". The
> second GPU then sees only "0" (terminated at the first '\0'),
> parses a single value, hits the single-value fallthrough
> (index == 1), and applies timeout=0 to all rings — causing
> immediate false job timeouts.
>
> Fix this by using kstrdup() to make a local copy before calling
> strsep(), so the global module parameter buffer remains intact
> across calls. A separate pointer is kept to the allocation start
> since strsep() advances the working pointer to NULL by the end
> of parsing.
>
> v2: wrap commit message to 72 columns, add Assisted-by tag.
>
> Assisted-by: Claude:claude-opus-4-6
> Signed-off-by: Ruijing Dong <[email protected]>

Reviewed-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++++++---
>  1 file changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index dcae77b6c272..97ebcc5bb763 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3498,7 +3498,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>
>  static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>  {
> -       char *input = amdgpu_lockup_timeout;
> +       char *input, *input_copy;
>         char *timeout_setting = NULL;
>         int index = 0;
>         long timeout;
> @@ -3508,14 +3508,25 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>         adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
>                 adev->video_timeout = msecs_to_jiffies(2000);
>
> -       if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
> +       if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
>                 return 0;
>
> +       /*
> +        * strsep() destructively modifies its input by replacing delimiters
> +        * with '\0'. Make a local copy so the global module parameter buffer
> +        * remains intact for multi-GPU systems where this function is called
> +        * once per device.
> +        */
> +       input = kstrdup(amdgpu_lockup_timeout, GFP_KERNEL);
> +       if (!input)
> +               return -ENOMEM;
> +       input_copy = input;
> +
>         while ((timeout_setting = strsep(&input, ",")) &&
>                strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
>                 ret = kstrtol(timeout_setting, 0, &timeout);
>                 if (ret)
> -                       return ret;
> +                       goto out_free;
>
>                 if (timeout == 0) {
>                         index++;
> @@ -3551,6 +3562,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>                 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
>                         adev->video_timeout = timeout;
>
> +out_free:
> +       kfree(input_copy);
>         return ret;
>  }
>
> --
> 2.49.0.593.gd86a19f485
>

Reply via email to