amdgpu_device_get_job_timeout_settings() passes a pointer to the global
amdgpu_lockup_timeout[] buffer directly into strsep(). strsep() modifies
its input in place, replacing each delimiter character with '\0'.
On multi-GPU systems, this function is called once per device. When a
multi-value setting like "0,0,0,-1" is used, the first GPU's call
transforms the global buffer into "0\00\00\0-1". The second GPU then
sees only "0" (terminated at the first '\0'), parses a single value,
hits the single-value fallthrough (index == 1), and applies timeout=0
to all rings — causing immediate false job timeouts.

Fix this by using kstrdup() to make a local copy before calling
strsep(), so the global module parameter buffer remains intact across
calls. A separate pointer is kept to the allocation start since
strsep() advances the working pointer to NULL by the end of parsing.

v2: wrap commit message to 72 columns, add Assisted-by tag.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Ruijing Dong <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dcae77b6c272..97ebcc5bb763 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3498,7 +3498,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 {
-	char *input = amdgpu_lockup_timeout;
+	char *input, *input_copy;
 	char *timeout_setting = NULL;
 	int index = 0;
 	long timeout;
@@ -3508,14 +3508,25 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 	adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
 		adev->video_timeout = msecs_to_jiffies(2000);

-	if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
+	if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
 		return 0;

+	/*
+	 * strsep() destructively modifies its input by replacing delimiters
+	 * with '\0'. Make a local copy so the global module parameter buffer
+	 * remains intact for multi-GPU systems where this function is called
+	 * once per device.
+	 */
+	input = kstrdup(amdgpu_lockup_timeout, GFP_KERNEL);
+	if (!input)
+		return -ENOMEM;
+	input_copy = input;
+
 	while ((timeout_setting = strsep(&input, ",")) &&
 	       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
 		ret = kstrtol(timeout_setting, 0, &timeout);
 		if (ret)
-			return ret;
+			goto out_free;

 		if (timeout == 0) {
 			index++;
@@ -3551,6 +3562,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
 			adev->video_timeout = timeout;

+out_free:
+	kfree(input_copy);
 	return ret;
 }

-- 
2.49.0.593.gd86a19f485
