amdgpu_device_get_job_timeout_settings() passes a pointer directly to
the global amdgpu_lockup_timeout[] buffer into strsep(). strsep()
destructively replaces delimiter characters with '\0' in-place.

On multi-GPU systems, this function is called once per device. When a
multi-value setting like "0,0,0,-1" is used, the first GPU's call
overwrites every ',' in the global buffer with '\0', leaving the bytes
"0" NUL "0" NUL "0" NUL "-1". The second GPU then sees only "0"
(terminated at the first '\0'), parses a single value, hits the
single-value fallthrough (index == 1), and applies timeout=0 to all
rings — causing immediate false job timeouts.

Fix this by using kstrdup() to make a local copy before calling strsep(),
so the global module parameter buffer remains intact across calls. A
separate pointer to the start of the allocation is kept for kfree(),
because strsep() advances the working pointer past each token and sets
it to NULL once the last token has been consumed.

Signed-off-by: Ruijing Dong <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index dcae77b6c272..97ebcc5bb763 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3498,7 +3498,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 
 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 {
-       char *input = amdgpu_lockup_timeout;
+       char *input, *input_copy;
        char *timeout_setting = NULL;
        int index = 0;
        long timeout;
@@ -3508,14 +3508,25 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
        adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
                adev->video_timeout = msecs_to_jiffies(2000);
 
-       if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
+       if (!strnlen(amdgpu_lockup_timeout, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
                return 0;
 
+       /*
+        * strsep() destructively modifies its input by replacing delimiters
+        * with '\0'. Make a local copy so the global module parameter buffer
+        * remains intact for multi-GPU systems where this function is called
+        * once per device.
+        */
+       input = kstrdup(amdgpu_lockup_timeout, GFP_KERNEL);
+       if (!input)
+               return -ENOMEM;
+       input_copy = input;
+
        while ((timeout_setting = strsep(&input, ",")) &&
               strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
                ret = kstrtol(timeout_setting, 0, &timeout);
                if (ret)
-                       return ret;
+                       goto out_free;
 
                if (timeout == 0) {
                        index++;
@@ -3551,6 +3562,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
                adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
                        adev->video_timeout = timeout;
 
+out_free:
+       kfree(input_copy);
        return ret;
 }
 
-- 
2.49.0.593.gd86a19f485

Reply via email to