On Sun, 15 Feb 2026 at 21:41, Mike Lothian <[email protected]> wrote:
> > > On Thu, 25 Sept 2025 at 14:03, Christian König < > [email protected]> wrote: > >> There has been multiple complains that 10 seconds are usually to long. >> >> The original requirement for longer timeout came from compute tests on >> AMDVLK, since that is no longer a topic reduce the timeout back to 2 >> seconds for all queues. >> >> While at it also remove any special handling for compute queues under >> SRIOV or pass through. >> >> Signed-off-by: Christian König <[email protected]> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 85 ++++++++++------------ >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 21 ++---- >> 2 files changed, 48 insertions(+), 58 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index a77000c2e0bb..ceb3c616292c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -4278,58 +4278,53 @@ static int >> amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) >> long timeout; >> int ret = 0; >> >> - /* >> - * By default timeout for jobs is 10 sec >> - */ >> - adev->compute_timeout = adev->gfx_timeout = >> msecs_to_jiffies(10000); >> - adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; >> + /* By default timeout for all queues is 2 sec */ >> + adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = >> + adev->video_timeout = msecs_to_jiffies(2000); >> >> - if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { >> - while ((timeout_setting = strsep(&input, ",")) && >> - strnlen(timeout_setting, >> AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { >> - ret = kstrtol(timeout_setting, 0, &timeout); >> - if (ret) >> - return ret; >> + if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) >> + return 0; >> >> - if (timeout == 0) { >> - index++; >> - continue; >> - } else if (timeout < 0) { >> - timeout = MAX_SCHEDULE_TIMEOUT; >> - dev_warn(adev->dev, "lockup timeout >> disabled"); >> - 
add_taint(TAINT_SOFTLOCKUP, >> LOCKDEP_STILL_OK); >> - } else { >> - timeout = msecs_to_jiffies(timeout); >> - } >> + while ((timeout_setting = strsep(&input, ",")) && >> + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) >> { >> + ret = kstrtol(timeout_setting, 0, &timeout); >> + if (ret) >> + return ret; >> >> - switch (index++) { >> - case 0: >> - adev->gfx_timeout = timeout; >> - break; >> - case 1: >> - adev->compute_timeout = timeout; >> - break; >> - case 2: >> - adev->sdma_timeout = timeout; >> - break; >> - case 3: >> - adev->video_timeout = timeout; >> - break; >> - default: >> - break; >> - } >> + if (timeout == 0) { >> + index++; >> + continue; >> + } else if (timeout < 0) { >> + timeout = MAX_SCHEDULE_TIMEOUT; >> + dev_warn(adev->dev, "lockup timeout disabled"); >> + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); >> + } else { >> + timeout = msecs_to_jiffies(timeout); >> } >> - /* >> - * There is only one value specified and >> - * it should apply to all non-compute jobs. >> - */ >> - if (index == 1) { >> - adev->sdma_timeout = adev->video_timeout = >> adev->gfx_timeout; >> - if (amdgpu_sriov_vf(adev) || >> amdgpu_passthrough(adev)) >> - adev->compute_timeout = adev->gfx_timeout; >> + >> + switch (index++) { >> + case 0: >> + adev->gfx_timeout = timeout; >> + break; >> + case 1: >> + adev->compute_timeout = timeout; >> + break; >> + case 2: >> + adev->sdma_timeout = timeout; >> + break; >> + case 3: >> + adev->video_timeout = timeout; >> + break; >> + default: >> + break; >> } >> } >> >> + /* When only one value specified apply it to all queues. 
*/ >> + if (index == 1) >> + adev->gfx_timeout = adev->compute_timeout = >> adev->sdma_timeout = >> + adev->video_timeout = timeout; >> + >> return ret; >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> index ece251cbe8c3..fe45dd1d979e 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> @@ -353,22 +353,17 @@ module_param_named(svm_default_granularity, >> amdgpu_svm_default_granularity, uint >> * DOC: lockup_timeout (string) >> * Set GPU scheduler timeout value in ms. >> * >> - * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is >> there can be one or >> - * multiple values specified. 0 and negative values are invalidated. >> They will be adjusted >> - * to the default timeout. >> + * The format can be [single value] for setting all timeouts at once or >> + * [GFX,Compute,SDMA,Video] to set individual timeouts. >> + * Negative values mean infinity. >> * >> - * - With one value specified, the setting will apply to all non-compute >> jobs. >> - * - With multiple values specified, the first one will be for GFX. >> - * The second one is for Compute. The third and fourth ones are >> - * for SDMA and Video. >> - * >> - * By default(with no lockup_timeout settings), the timeout for all jobs >> is 10000. >> + * By default(with no lockup_timeout settings), the timeout for all >> queues is 2000. >> */ >> MODULE_PARM_DESC(lockup_timeout, >> - "GPU lockup timeout in ms (default: 10000 for all jobs. " >> - "0: keep default value. negative: infinity timeout), >> format: for bare metal [Non-Compute] or [GFX,Compute,SDMA,Video]; " >> - "for passthrough or sriov [all jobs] or >> [GFX,Compute,SDMA,Video]."); >> -module_param_string(lockup_timeout, amdgpu_lockup_timeout, >> sizeof(amdgpu_lockup_timeout), 0444); >> + "GPU lockup timeout in ms (default: 2000 for all queues. >> " >> + "0: keep default value. 
negative: infinity timeout), >> format: [single value for all] or [GFX,Compute,SDMA,Video]."); >> +module_param_string(lockup_timeout, amdgpu_lockup_timeout, >> + sizeof(amdgpu_lockup_timeout), 0444); >> >> /** >> * DOC: dpm (int) >> -- >> 2.43.0 >> >> Hi > > This patch is causing issues with running: > > ~/GravityMark_1.89_linux $ DRI_PRIME=1 ./run_fullscreen_vk_rt.sh > > M: 0 us: ../data.zip: 313 files > M: 15.19 ms: Temporal antialiasing > M: 15.21 ms: Fullscreen mode > M: 15.22 ms: Render Statistics > M: 20.77 ms: Build Date: Jun 20 2025 > M: 20.80 ms: Build Info: version=20250429; linux; x64; release; vk=1; > gl=45; gles=32; cu=1; fusion > M: 20.81 ms: Build Version: 1.89 > M: 48.06 ms: Name: ASUSTeK COMPUTER INC. G513QY ROG Strix G513QY_G513QY > M: 48.09 ms: System: 'Gentoo Linux' > M: 48.10 ms: Kernel: Linux 6.19.0-rc7-drm+ x86_64 > M: 48.11 ms: Memory: 62.19 GB > M: 48.13 ms: Uptime: 19.00 s > M: 48.15 ms: CPU: AMD Ryzen 9 5900HX with Radeon Graphics > M: 48.17 ms: GPU 0: [AMD/ATI] Navi 22 [Radeon RX 6700/6700 XT/6750 XT / > 6800M/6850M XT] (rev c3) > M: 48.18 ms: Device: VEN_1002&DEV_73DF&SUBSYS_16C21043 > M: 48.19 ms: Memory: 11.98 GB > M: 48.21 ms: GPU 1: [AMD/ATI] Cezanne [Radeon Vega Series / Radeon Vega > Mobile Series] (rev c4) > M: 48.22 ms: Device: VEN_1002&DEV_1638&SUBSYS_16C21043 > M: 48.23 ms: Memory: 512.00 MB > M: 48.53 ms: Desktop: 2560x1440 1.0 > M: 48.55 ms: Screen 0: 2560x1440 0 0 eDP-1 > M: 48.57 ms: Set fullscreen mode on 0 screen > M: 51.48 ms: Creating 2560x1440 Vulkan Window > M: 147.88 ms: Render Size: 2560x1440 > M: 149.22 ms: Using Fetch Mode > M: 233.88 ms: Device: AMD Radeon RX 6800M (RADV NAVI22) > M: 233.95 ms: Vendor: AMD > M: 233.96 ms: Version: 26.0.99 > M: 233.97 ms: DeviceID: 0x73df > M: 234.35 ms: Group Memory: 64.00 KB > M: 234.36 ms: Video Memory: 11.98 GB > M: 234.37 ms: Max Uniform Size: 4.00 GB > M: 234.38 ms: Max Storage Size: 4.00 GB > M: 234.38 ms: Creating SceneManager > M: 416.06 ms: Creating RenderManager > M: 
547.17 ms: Ray Tracing Mode > M: 547.20 ms: Creating Scene > M: 1.481 s: Creating 200,000 Asteroids > M: 1.600 s: Updating Scene > M: 1.751 s: GravityMark 1.89 Vulkan RT is Ready in 1.7 s > M: 1.751 s: Starting 2560x1440 Vulkan RT Benchmark > M: 1.751 s: Count: 1 > M: 1.752 s: Resizing 2560x1440 frame > M: 1.753 s: Build buffer 44.74 MB > radv/amdgpu: The CS has been cancelled because the context is lost. This > context is guilty of a hard recovery. > E: 4.151 s: VK::error(): device lost > E: 4.152 s: VKContext::Frame::submit(): can't submit command buffer > E: 4.152 s: VKContext::submit(): can't submit frame > E: 4.152 s: VKWindow::present(): can't submit context > E: 4.152 s: GravityMark::render(): can't present window > E: 9.347 s: VK::error(): device lost > E: 9.347 s: VKContext::Frame::wait(): can't wait for fence > E: 9.347 s: VKContext::finish(): can't wait frame > E: 9.347 s: VK::error(): device lost > E: 9.347 s: VKContext::Frame::wait(): can't wait for fence > E: 9.347 s: VKContext::finish(): can't wait frame > E: 9.347 s: VK::error(): device lost > E: 9.347 s: VKContext::Frame::wait(): can't wait for fence > E: 9.347 s: VKContext::finish(): can't wait frame > E: 9.347 s: VKWindow::finish(): can't finish context > M: 9.347 s: Clearing Scene > E: 9.575 s: VK::error(): device lost > E: 9.575 s: VKContext::Frame::wait(): can't wait for fence > E: 9.575 s: VKContext::finish(): can't wait frame > M: 9.575 s: Restore fullscreen mode on 0 screen > E: 9.583 s: VK::error(): device lost > E: 9.583 s: VKContext::Frame::wait(): can't wait for fence > E: 9.583 s: VKContext::finish(): can't wait frame > E: 9.583 s: VK::error(): device lost > E: 9.583 s: VKContext::Frame::wait(): can't wait for fence > E: 9.583 s: VKContext::finish(): can't wait frame > > > It's only the full screen and RT that seem to have issues > > Dmesg: > > Feb 15 21:16:13 axion.fireburn.co.uk kernel: [drm] PCIE GART of 512M > enabled (table at 0x00000082FEB00000). 
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > PSP is resuming... > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > reserve 0xa00000 from 0x82fd000000 for PSP TMR > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > RAS: optional ras ta ucode is not available > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > SECUREDISPLAY: optional securedisplay ta ucode is not available > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > SMU is resuming... > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > smu driver if version = 0x0000000e, smu fw if version = 0x00000012, smu fw > program = 0, version = 0x00413f00 (65.63.0) > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > SMU driver if version not matched > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > Setting new power limit is not supported! > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > SMU is resumed successfully! 
> Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > kiq ring mec 2 pipe 1 q 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > [drm] DMUB hardware initialized: version=0x02020021 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm] > Cannot find any crtc or sizes > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring gfx_0.0.0 uses VM inv eng 0 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring gfx_0.1.0 uses VM inv eng 1 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.0.0 uses VM inv eng 4 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.1.0 uses VM inv eng 5 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.2.0 uses VM inv eng 6 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.3.0 uses VM inv eng 7 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.0.1 uses VM inv eng 8 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.1.1 uses VM inv eng 9 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.2.1 uses VM inv eng 10 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring comp_1.3.1 uses VM inv eng 11 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring kiq_0.2.1.0 uses VM inv eng 12 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring sdma0 uses VM inv eng 13 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring sdma1 uses VM inv eng 14 on hub 0 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring vcn_dec_0 
uses VM inv eng 0 on hub 8 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring vcn_enc_0.0 uses VM inv eng 1 on hub 8 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring vcn_enc_0.1 uses VM inv eng 4 on hub 8 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring jpeg_dec uses VM inv eng 5 on hub 8 > Feb 15 21:16:13 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm] > Cannot find any crtc or sizes > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > Dumping IP State > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > Dumping IP State Completed > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > [drm] AMDGPU device coredump file has been created > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > [drm] Check your /sys/class/drm/card0/device/devcoredump/data > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > ring gfx_0.0.0 timeout, signaled seq=99, emitted seq=100 > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > Process GravityMark.x64 pid 1794 thread GravityMark.x64 pid 1794 > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > Starting gfx_0.0.0 ring reset > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: amdgpu: > Ring gfx_0.0.0 reset succeeded > Feb 15 21:16:17 axion.fireburn.co.uk kernel: amdgpu 0000:03:00.0: [drm] > device wedged, but recovered through reset > > I got things working with this patch: > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index c7f44422939f..5a3f02a26192 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -4206,7 +4206,7 @@ static int > amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) > > /* By default 
timeout for all queues is 2 sec */ > adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = > - adev->video_timeout = msecs_to_jiffies(2000); > + adev->video_timeout = msecs_to_jiffies(5000); > > if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) > return 0; > > A timeout of 3000 ms also works.
