Hi, On 2025-07-30 14:58:54 -0400, roman...@amd.com wrote: > From: Aurabindo Pillai <aurabindo.pil...@amd.com> > > Accessing DC from amdgpu_dm is usually preceded by acquisition of > dc_lock mutex. Most of the DC API that DM calls are under a DC lock. > However, there are a few that are not. Some DC API called from interrupt > context end up sending DMUB commands via a DC API, while other threads were > using DMUB. This was apparent from a race between calls for setting idle > optimization enable/disable and the DC API to set vmin/vmax. > > Offload the call to dc_stream_adjust_vmin_vmax() to a thread instead > of directly calling them from the interrupt handler such that it waits > for dc_lock. > > Reviewed-by: Nicholas Kazlauskas <nicholas.kazlaus...@amd.com> > Signed-off-by: Aurabindo Pillai <aurabindo.pil...@amd.com> > Signed-off-by: Roman Li <roman...@amd.com>
With this patch I get sleeping function called from invalid context roughly every second. This occurs on at least PREEMPT_LAZY and PREEMPT_VOLUNTARY. BUG: sleeping function called from invalid context at include/linux/sched/mm.h:321 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 13841, name: cc1 preempt_count: 10002, expected: 0 CPU: 23 UID: 1000 PID: 13841 Comm: cc1 Kdump: loaded Tainted: G W 6.16.0-next-20250805 #631 PREEMPTLAZY Tainted: [W]=WARN Hardware name: Micro-Star International Co., Ltd. MS-7C91/MAG B550 TOMAHAWK (MS-7C91), BIOS A.I0 03/20/2025 Call Trace: <TASK> dump_stack_lvl+0x4b/0x70 __might_resched.cold+0xac/0xb9 __kmalloc_cache_noprof+0x2d4/0x430 ? schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu] ? __alloc_frozen_pages_noprof+0x18f/0x360 ? _raw_spin_unlock_irqrestore+0x12/0x40 ? schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu] schedule_dc_vmin_vmax+0x49/0x1c0 [amdgpu] dm_crtc_high_irq+0x2ab/0x310 [amdgpu] amdgpu_dm_irq_handler+0x8d/0x210 [amdgpu] amdgpu_irq_dispatch+0x166/0x1a0 [amdgpu] amdgpu_ih_process+0x60/0x160 [amdgpu] amdgpu_irq_handler+0x23/0x60 [amdgpu] __handle_irq_event_percpu+0x4a/0x1a0 handle_irq_event+0x38/0x90 handle_edge_irq+0xc4/0x190 __common_interrupt+0x44/0xe0 ? srso_return_thunk+0x5/0x5f common_interrupt+0x3a/0xa0 asm_common_interrupt+0x26/0x40 RIP: 0033:0x55d4848fab81 Code: 28 44 8b 5f 70 ff 87 80 00 00 00 49 89 d7 48 89 34 24 45 8d 73 ff 44 89 f3 21 cb 48 8b 4f 58 41 89 dd 49 c1 e5 03 4a 8b 2c 29 <48> 85 ed 0f 84 36 02 00 00 48 83 fd ff 0f 84 bc 00 00 00 44 39 4d RSP: 002b:00007fff71eeed50 EFLAGS: 00000212 RAX: 000000000000001f RBX: 0000000000002182 RCX: 00007f47b3f00010 RDX: 000000000000001f RSI: 000055d4bfce1e3c RDI: 000055d4bfbb2e80 RBP: 0000000000000000 R08: 0000000000000001 R09: 000000006d35e182 R10: 000055d4bfb94630 R11: 0000000000004000 R12: 000055d4bfbb2e80 R13: 0000000000010c10 R14: 0000000000003fff R15: 000000000000001f </TASK> ... 
> +static void schedule_dc_vmin_vmax(struct amdgpu_device *adev, > + struct dc_stream_state *stream, > + struct dc_crtc_timing_adjust *adjust) > +{ > + struct vupdate_offload_work *offload_work = > kzalloc(sizeof(*offload_work), GFP_KERNEL); > + if (!offload_work) { > + drm_dbg_driver(adev_to_drm(adev), "Failed to allocate > vupdate_offload_work\n"); > + return; > + } > + > + struct dc_crtc_timing_adjust *adjust_copy = > kzalloc(sizeof(*adjust_copy), GFP_KERNEL); > + if (!adjust_copy) { > + drm_dbg_driver(adev_to_drm(adev), "Failed to allocate > adjust_copy\n"); > + kfree(offload_work); > + return; > + } > + > + dc_stream_retain(stream); > + memcpy(adjust_copy, adjust, sizeof(*adjust_copy)); > + > + INIT_WORK(&offload_work->work, dm_handle_vmin_vmax_update); > + offload_work->adev = adev; > + offload_work->stream = stream; > + offload_work->adjust = adjust_copy; > + > + queue_work(system_wq, &offload_work->work); > +} > + The allocations in this function seem to be the culprit. If I change them to GFP_ATOMIC the issue disappears: diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 31ea57edeb45..afe0fea13bb0 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -550,13 +550,13 @@ static void schedule_dc_vmin_vmax(struct amdgpu_device *adev, struct dc_stream_state *stream, struct dc_crtc_timing_adjust *adjust) { - struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_KERNEL); + struct vupdate_offload_work *offload_work = kzalloc(sizeof(*offload_work), GFP_ATOMIC); if (!offload_work) { drm_dbg_driver(adev_to_drm(adev), "Failed to allocate vupdate_offload_work\n"); return; } - struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_KERNEL); + struct dc_crtc_timing_adjust *adjust_copy = kzalloc(sizeof(*adjust_copy), GFP_ATOMIC); if (!adjust_copy) { drm_dbg_driver(adev_to_drm(adev), 
"Failed to allocate adjust_copy\n"); kfree(offload_work); ... Regards, Klara Modin