Re: [PATCH] drm/amdgpu: reduce queue timeout to 2 seconds

Mike Lothian Sun, 15 Feb 2026 13:49:29 -0800

On Sun, 15 Feb 2026 at 21:41, Mike Lothian <[email protected]> wrote:


>
>
> On Thu, 25 Sept 2025 at 14:03, Christian König <
> [email protected]> wrote:
>
>> There have been multiple complaints that 10 seconds is usually too long.
>>
>> The original requirement for a longer timeout came from compute tests on
>> AMDVLK. Since that is no longer a topic, reduce the timeout back to 2
>> seconds for all queues.
>>
>> While at it also remove any special handling for compute queues under
>> SRIOV or pass through.
>>
>> Signed-off-by: Christian König <[email protected]>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 85 ++++++++++------------
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 21 ++----
>>  2 files changed, 48 insertions(+), 58 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index a77000c2e0bb..ceb3c616292c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4278,58 +4278,53 @@ static int
>> amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>>         long timeout;
>>         int ret = 0;
>>
>> -       /*
>> -        * By default timeout for jobs is 10 sec
>> -        */
>> -       adev->compute_timeout = adev->gfx_timeout =
>> msecs_to_jiffies(10000);
>> -       adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
>> +       /* By default timeout for all queues is 2 sec */
>> +       adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
>> +               adev->video_timeout = msecs_to_jiffies(2000);
>>
>> -       if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
>> -               while ((timeout_setting = strsep(&input, ",")) &&
>> -                               strnlen(timeout_setting,
>> AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
>> -                       ret = kstrtol(timeout_setting, 0, &timeout);
>> -                       if (ret)
>> -                               return ret;
>> +       if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
>> +               return 0;
>>
>> -                       if (timeout == 0) {
>> -                               index++;
>> -                               continue;
>> -                       } else if (timeout < 0) {
>> -                               timeout = MAX_SCHEDULE_TIMEOUT;
>> -                               dev_warn(adev->dev, "lockup timeout
>> disabled");
>> -                               add_taint(TAINT_SOFTLOCKUP,
>> LOCKDEP_STILL_OK);
>> -                       } else {
>> -                               timeout = msecs_to_jiffies(timeout);
>> -                       }
>> +       while ((timeout_setting = strsep(&input, ",")) &&
>> +              strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
>> {
>> +               ret = kstrtol(timeout_setting, 0, &timeout);
>> +               if (ret)
>> +                       return ret;
>>
>> -                       switch (index++) {
>> -                       case 0:
>> -                               adev->gfx_timeout = timeout;
>> -                               break;
>> -                       case 1:
>> -                               adev->compute_timeout = timeout;
>> -                               break;
>> -                       case 2:
>> -                               adev->sdma_timeout = timeout;
>> -                               break;
>> -                       case 3:
>> -                               adev->video_timeout = timeout;
>> -                               break;
>> -                       default:
>> -                               break;
>> -                       }
>> +               if (timeout == 0) {
>> +                       index++;
>> +                       continue;
>> +               } else if (timeout < 0) {
>> +                       timeout = MAX_SCHEDULE_TIMEOUT;
>> +                       dev_warn(adev->dev, "lockup timeout disabled");
>> +                       add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
>> +               } else {
>> +                       timeout = msecs_to_jiffies(timeout);
>>                 }
>> -               /*
>> -                * There is only one value specified and
>> -                * it should apply to all non-compute jobs.
>> -                */
>> -               if (index == 1) {
>> -                       adev->sdma_timeout = adev->video_timeout =
>> adev->gfx_timeout;
>> -                       if (amdgpu_sriov_vf(adev) ||
>> amdgpu_passthrough(adev))
>> -                               adev->compute_timeout = adev->gfx_timeout;
>> +
>> +               switch (index++) {
>> +               case 0:
>> +                       adev->gfx_timeout = timeout;
>> +                       break;
>> +               case 1:
>> +                       adev->compute_timeout = timeout;
>> +                       break;
>> +               case 2:
>> +                       adev->sdma_timeout = timeout;
>> +                       break;
>> +               case 3:
>> +                       adev->video_timeout = timeout;
>> +                       break;
>> +               default:
>> +                       break;
>>                 }
>>         }
>>
>> +       /* When only one value specified apply it to all queues. */
>> +       if (index == 1)
>> +               adev->gfx_timeout = adev->compute_timeout =
>> adev->sdma_timeout =
>> +                       adev->video_timeout = timeout;
>> +
>>         return ret;
>>  }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index ece251cbe8c3..fe45dd1d979e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -353,22 +353,17 @@ module_param_named(svm_default_granularity,
>> amdgpu_svm_default_granularity, uint
>>   * DOC: lockup_timeout (string)
>>   * Set GPU scheduler timeout value in ms.
>>   *
>> - * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is
>> there can be one or
>> - * multiple values specified. 0 and negative values are invalidated.
>> They will be adjusted
>> - * to the default timeout.
>> + * The format can be [single value] for setting all timeouts at once or
>> + * [GFX,Compute,SDMA,Video] to set individual timeouts.
>> + * Negative values mean infinity.
>>   *
>> - * - With one value specified, the setting will apply to all non-compute
>> jobs.
>> - * - With multiple values specified, the first one will be for GFX.
>> - *   The second one is for Compute. The third and fourth ones are
>> - *   for SDMA and Video.
>> - *
>> - * By default(with no lockup_timeout settings), the timeout for all jobs
>> is 10000.
>> + * By default(with no lockup_timeout settings), the timeout for all
>> queues is 2000.
>>   */
>>  MODULE_PARM_DESC(lockup_timeout,
>> -                "GPU lockup timeout in ms (default: 10000 for all jobs. "
>> -                "0: keep default value. negative: infinity timeout),
>> format: for bare metal [Non-Compute] or [GFX,Compute,SDMA,Video]; "
>> -                "for passthrough or sriov [all jobs] or
>> [GFX,Compute,SDMA,Video].");
>> -module_param_string(lockup_timeout, amdgpu_lockup_timeout,
>> sizeof(amdgpu_lockup_timeout), 0444);
>> +                "GPU lockup timeout in ms (default: 2000 for all queues.
>> "
>> +                "0: keep default value. negative: infinity timeout),
>> format: [single value for all] or [GFX,Compute,SDMA,Video].");
>> +module_param_string(lockup_timeout, amdgpu_lockup_timeout,
>> +                   sizeof(amdgpu_lockup_timeout), 0444);
>>
>>  /**
>>   * DOC: dpm (int)
>> --
>> 2.43.0
>>
> Hi,
>
> This patch is causing issues with running:
>
>   ~/GravityMark_1.89_linux $ DRI_PRIME=1 ./run_fullscreen_vk_rt.sh
>
> M:      0 us: ../data.zip: 313 files
> M:  15.19 ms: Temporal antialiasing
> M:  15.21 ms: Fullscreen mode
> M:  15.22 ms: Render Statistics
> M:  20.77 ms: Build Date: Jun 20 2025
> M:  20.80 ms: Build Info: version=20250429; linux; x64; release; vk=1;
> gl=45; gles=32; cu=1; fusion
> M:  20.81 ms: Build Version: 1.89
> M:  48.06 ms: Name: ASUSTeK COMPUTER INC. G513QY ROG Strix G513QY_G513QY
> M:  48.09 ms: System: 'Gentoo Linux'
> M:  48.10 ms: Kernel: Linux 6.19.0-rc7-drm+ x86_64
> M:  48.11 ms: Memory: 62.19 GB
> M:  48.13 ms: Uptime: 19.00 s
> M:  48.15 ms: CPU: AMD Ryzen 9 5900HX with Radeon Graphics
> M:  48.17 ms: GPU 0: [AMD/ATI] Navi 22 [Radeon RX 6700/6700 XT/6750 XT /
> 6800M/6850M XT] (rev c3)
> M:  48.18 ms: Device: VEN_1002&DEV_73DF&SUBSYS_16C21043
> M:  48.19 ms: Memory: 11.98 GB
> M:  48.21 ms: GPU 1: [AMD/ATI] Cezanne [Radeon Vega Series / Radeon Vega
> Mobile Series] (rev c4)
> M:  48.22 ms: Device: VEN_1002&DEV_1638&SUBSYS_16C21043
> M:  48.23 ms: Memory: 512.00 MB
> M:  48.53 ms: Desktop: 2560x1440 1.0
> M:  48.55 ms: Screen 0: 2560x1440 0 0 eDP-1
> M:  48.57 ms: Set fullscreen mode on 0 screen
> M:  51.48 ms: Creating 2560x1440 Vulkan Window
> M: 147.88 ms: Render Size: 2560x1440
> M: 149.22 ms: Using Fetch Mode
> M: 233.88 ms: Device: AMD Radeon RX 6800M (RADV NAVI22)
> M: 233.95 ms: Vendor: AMD
> M: 233.96 ms: Version: 26.0.99
> M: 233.97 ms: DeviceID: 0x73df
> M: 234.35 ms: Group Memory: 64.00 KB
> M: 234.36 ms: Video Memory: 11.98 GB
> M: 234.37 ms: Max Uniform Size: 4.00 GB
> M: 234.38 ms: Max Storage Size: 4.00 GB
> M: 234.38 ms: Creating SceneManager
> M: 416.06 ms: Creating RenderManager
> M: 547.17 ms: Ray Tracing Mode
> M: 547.20 ms: Creating Scene
> M:   1.481 s: Creating 200,000 Asteroids
> M:   1.600 s: Updating Scene
> M:   1.751 s: GravityMark 1.89 Vulkan RT is Ready in 1.7 s
> M:   1.751 s: Starting 2560x1440 Vulkan RT Benchmark
> M:   1.751 s: Count: 1
> M:   1.752 s: Resizing 2560x1440 frame
> M:   1.753 s: Build buffer 44.74 MB
> radv/amdgpu: The CS has been cancelled because the context is lost. This
> context is guilty of a hard recovery.
> E:   4.151 s: VK::error(): device lost
> E:   4.152 s: VKContext::Frame::submit(): can't submit command buffer
> E:   4.152 s: VKContext::submit(): can't submit frame
> E:   4.152 s: VKWindow::present(): can't submit context
> E:   4.152 s: GravityMark::render(): can't present window
> E:   9.347 s: VK::error(): device lost
> E:   9.347 s: VKContext::Frame::wait(): can't wait for fence
> E:   9.347 s: VKContext::finish(): can't wait frame
> E:   9.347 s: VK::error(): device lost
> E:   9.347 s: VKContext::Frame::wait(): can't wait for fence
> E:   9.347 s: VKContext::finish(): can't wait frame
> E:   9.347 s: VK::error(): device lost
> E:   9.347 s: VKContext::Frame::wait(): can't wait for fence
> E:   9.347 s: VKContext::finish(): can't wait frame
> E:   9.347 s: VKWindow::finish(): can't finish context
> M:   9.347 s: Clearing Scene
> E:   9.575 s: VK::error(): device lost
> E:   9.575 s: VKContext::Frame::wait(): can't wait for fence
> E:   9.575 s: VKContext::finish(): can't wait frame
> M:   9.575 s: Restore fullscreen mode on 0 screen
> E:   9.583 s: VK::error(): device lost
> E:   9.583 s: VKContext::Frame::wait(): can't wait for fence
> E:   9.583 s: VKContext::finish(): can't wait frame
> E:   9.583 s: VK::error(): device lost
> E:   9.583 s: VKContext::Frame::wait(): can't wait for fence
> E:   9.583 s: VKContext::finish(): can't wait frame
>
>
> It's only the full screen and RT that seem to have issues
>
> Dmesg:
>
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: [drm] PCIE GART of 512M
> enabled (table at 0x00000082FEB00000).
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> PSP is resuming...
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> reserve 0xa00000 from 0x82fd000000 for PSP TMR
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> RAS: optional ras ta ucode is not available
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> SECUREDISPLAY: optional securedisplay ta ucode is not available
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> SMU is resuming...
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> smu driver if version = 0x0000000e, smu fw if version = 0x00000012, smu fw
> program = 0, version = 0x00413f00 (65.63.0)
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> SMU driver if version not matched
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> Setting new power limit is not supported!
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> SMU is resumed successfully!
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> kiq ring mec 2 pipe 1 q 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> [drm] DMUB hardware initialized: version=0x02020021
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm]
> Cannot find any crtc or sizes
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring gfx_0.0.0 uses VM inv eng 0 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring gfx_0.1.0 uses VM inv eng 1 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.0.0 uses VM inv eng 4 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.1.0 uses VM inv eng 5 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.2.0 uses VM inv eng 6 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.3.0 uses VM inv eng 7 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.0.1 uses VM inv eng 8 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.1.1 uses VM inv eng 9 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.2.1 uses VM inv eng 10 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring comp_1.3.1 uses VM inv eng 11 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring kiq_0.2.1.0 uses VM inv eng 12 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring sdma0 uses VM inv eng 13 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring sdma1 uses VM inv eng 14 on hub 0
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring vcn_dec_0 uses VM inv eng 0 on hub 8
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring vcn_enc_0.0 uses VM inv eng 1 on hub 8
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring vcn_enc_0.1 uses VM inv eng 4 on hub 8
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring jpeg_dec uses VM inv eng 5 on hub 8
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm]
> Cannot find any crtc or sizes
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> Dumping IP State
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> Dumping IP State Completed
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> [drm] AMDGPU device coredump file has been created
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> [drm] Check your /sys/class/drm/card0/device/devcoredump/data
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> ring gfx_0.0.0 timeout, signaled seq=99, emitted seq=100
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
>  Process GravityMark.x64 pid 1794 thread GravityMark.x64 pid 1794
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> Starting gfx_0.0.0 ring reset
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu:
> Ring gfx_0.0.0 reset succeeded
> Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm]
> device wedged, but recovered through reset
>
> I got things working with this patch:
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index c7f44422939f..5a3f02a26192 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4206,7 +4206,7 @@ static int
> amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
>
>        /* By default timeout for all queues is 2 sec */
>        adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
> -               adev->video_timeout = msecs_to_jiffies(2000);
> +               adev->video_timeout = msecs_to_jiffies(5000);
>
>        if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH))
>                return 0;
>
>
A timeout of 3000 ms (instead of 5000) works too.

Re: [PATCH] drm/amdgpu: reduce queue timeout to 2 seconds

Reply via email to