On 3/17/26 19:17, Ruijing Dong wrote:
> amdgpu_device_get_job_timeout_settings() passes a pointer directly to
> the global amdgpu_lockup_timeout[] buffer into strsep(). strsep()
> destructively replaces delimiter characters with '\0' in-place.
>
> On multi-GPU systems, this function is called once per device. When a
> multi-value setting like "0,0,0,-1" is used, the first GPU's call
> transforms the global buffer into "0\00\00\0-1". The second GPU then
> sees only "0" (terminated at the first '\0'), parses a single value,
> hits the single-value fallthrough (index == 1), and applies timeout=0
> to all rings — causing immediate false job timeouts.
>
> Fix this by using kstrdup() to make a local copy before calling strsep(),
> so the global module parameter buffer remains intact across calls. A
> separate pointer is kept to the allocation start since strsep() advances
> the working pointer to NULL by the end of parsing.
>
> Signed-off-by: Ruijing Dong <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++++++---
> 1 file changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index dcae77b6c272..97ebcc5bb763 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3498,7 +3498,7 @@ static void amdgpu_device_xgmi_reset_func(struct
> work_struct *__work)
>
> static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
> {
> - char *input = amdgpu_lockup_timeout;
> + char *input, *input_copy;
> char *timeout_setting = NULL;
> int index = 0;
> long timeout;
> @@ -3508,14 +3508,25 @@ static int
> amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
> adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
> adev->video_timeout = msecs_to_jiffies(2000);
>
> - if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
> + if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
> return 0;
>
> + /*
> + * strsep() destructively modifies its input by replacing delimiters
> + * with '\0'. Make a local copy so the global module parameter buffer
> + * remains intact for multi-GPU systems where this function is called
> + * once per device.
> + */
> + input = kstrdup(amdgpu_lockup_timeout, GFP_KERNEL);
I think it is safe to copy the parameter to the stack instead of using
kmalloc() here.
Apart from that, it's a pretty good catch.
Regards,
Christian.
> + if (!input)
> + return -ENOMEM;
> + input_copy = input;
> +
> while ((timeout_setting = strsep(&input, ",")) &&
> strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
> ret = kstrtol(timeout_setting, 0, &timeout);
> if (ret)
> - return ret;
> + goto out_free;
>
> if (timeout == 0) {
> index++;
> @@ -3551,6 +3562,8 @@ static int
> amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
> adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
> adev->video_timeout = timeout;
>
> +out_free:
> + kfree(input_copy);
> return ret;
> }
>