AMD General
Hi Timur:
The new link detection delay workqueue causes a hard hang with a Dell 6K DP2
monitor after a couple of hotplugs. Please refer to the dmesg below for reference.
[ 182.455489] watchdog: CPU1: Watchdog detected hard LOCKUP on cpu 1
[ 182.455490] Modules linked in: snd_seq_dummy snd_hrtimer qrtr intel_rapl_msr
amd_atl intel_rapl_common binfmt_misc edac_mce_amd kvm_amd nls_iso8859_1
snd_hda_codec_atihdmi snd_hda_codec_hdmi kvm amdgpu snd_hda_intel rapl
eeepc_wmi amdxcp snd_hda_codec asus_wmi drm_panel_backlight_quirks
sparse_keymap platform_profile wmi_bmof snd_usb_audio snd_usbmidi_lib
snd_hda_core gpu_sched drm_buddy mc snd_intel_dspcfg drm_ttm_helper
snd_intel_sdw_acpi ttm drm_exec snd_seq_midi snd_seq_midi_event
drm_suballoc_helper snd_hwdep drm_client_lib snd_pcm snd_rawmidi
drm_display_helper i2c_piix4 k10temp i2c_smbus snd_seq drm_kms_helper r8169
snd_seq_device cec snd_timer snd rc_core i2c_algo_bit soundcore realtek
input_leds joydev gpio_amdpt gpio_generic acpi_pad mac_hid sch_fq_codel drm
efi_pstore nfnetlink hid_generic ucsi_acpi typec_ucsi typec usbhid hid
ghash_clmulni_intel ccp ahci libahci video thunderbolt wmi parport_pc lp ppdev
msr parport dmi_sysfs autofs4 aesni_intel
[ 182.455511] CPU: 1 UID: 0 PID: 262 Comm: kworker/1:1H Kdump: loaded Not
tainted 6.19.0-promotion-june2-patchy3+ #319 PREEMPT(voluntary)
[ 182.455513] Hardware name: ASUS System Product Name/ROG STRIX X870E-E GAMING
WIFI, BIOS 2202 04/09/2026
[ 182.455514] Workqueue: events_highpri dm_irq_work_func [amdgpu]
[ 182.455691] RIP: 0010:native_queued_spin_lock_slowpath+0x213/0x2c0
[ 182.455695] Code: 8d 46 01 41 c1 e5 10 c1 e0 12 41 09 c5 44 89 e8 c1 e8 10
66 87 43 02 89 c2 c1 e2 10 81 fa ff ff 00 00 77 51 31 d2 eb 02 f3 90 <8b> 03 66
85 c0 75 f7 44 39 e8 0f 84 85 00 00 00 c6 03 01 48 85 d2
[ 182.455696] RSP: 0018:ffffd17bc08e3ce0 EFLAGS: 00000006
[ 182.455697] RAX: 0000000000081000 RBX: ffffffff8d4176ff RCX: 0000000000000000
[ 182.455698] RDX: 0000000000000000 RSI: ffffffff8ca9f20c RDI: ffffffff8ca88b37
[ 182.455698] RBP: ffffd17bc08e3d00 R08: 8080808080808080 R09: ffff8a5c8186b100
[ 182.455698] R10: ffff8a5c80052ec0 R11: fefefefefefefeff R12: ffff8a63ac2731c0
[ 182.455699] R13: 0000000000080000 R14: 0000000000000001 R15: 0000000000000000
[ 182.455699] FS: 0000000000000000(0000) GS:ffff8a641ee49000(0000)
knlGS:0000000000000000
[ 182.455700] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 182.455700] CR2: 0000562bd6ca9000 CR3: 0000000200436000 CR4: 0000000000750ef0
[ 182.455701] PKRU: 55555554
[ 182.455701] Call Trace:
[ 182.455701] <TASK>
[ 182.455702] _raw_spin_lock_irqsave+0x4b/0x60
[ 182.455704] lock_timer_base+0x73/0xa0
[ 182.455707] timer_delete+0x36/0x80
[ 182.455708] try_to_grab_pending+0x121/0x1d0
[ 182.455710] __cancel_work+0x3b/0x100
[ 182.455711] ? finish_task_switch.isra.0+0x92/0x280
[ 182.455713] cancel_delayed_work+0x13/0x20
[ 182.455714] handle_hpd_irq+0x23/0x50 [amdgpu]
[ 182.455859] dm_irq_work_func+0x19/0x20 [amdgpu]
[ 182.455991] process_one_work+0x18f/0x3d0
[ 182.455992] worker_thread+0x2cf/0x410
[ 182.455993] ? _raw_spin_unlock_irqrestore+0x27/0x50
[ 182.455994] ? __pfx_worker_thread+0x10/0x10
[ 182.455995] kthread+0x11c/0x230
[ 182.455997] ? _raw_spin_unlock_irq+0x1f/0x40
[ 182.455998] ? __pfx_kthread+0x10/0x10
[ 182.455999] ret_from_fork+0x195/0x210
[ 182.456001] ? __pfx_kthread+0x10/0x10
[ 182.456002] ret_from_fork_asm+0x1a/0x30
[ 182.456004] </TASK>
[ 185.660573] amdgpu 0000:03:00.0: [drm] *ERROR* [CRTC:423:crtc-0] flip_done
timed out
[ 185.660587] amdgpu 0000:03:00.0: [drm:drm_atomic_state_default_clear [drm]]
Clearing atomic state 00000000f6b354ad
[ 185.660622] amdgpu 0000:03:00.0: [drm:__drm_atomic_state_free [drm]] Freeing
atomic state 00000000f6b354ad
[ 217.116563] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[ 217.116569] rcu: 1-...0: (9 GPs behind) idle=af8c/1/0x4000000000000000
softirq=14209/14209 fqs=5572
[ 217.116575] rcu: (detected by 13, t=15002 jiffies, g=21165, q=9426
ncpus=16)
[ 217.116578] Sending NMI from CPU 13 to CPUs 1:
[ 217.116580] NMI backtrace for cpu 1
[ 217.116581] CPU: 1 UID: 0 PID: 262 Comm: kworker/1:1H Kdump: loaded Not
tainted 6.19.0-promotion-june2-patchy3+ #319 PREEMPT(voluntary)
[ 217.116582] Hardware name: ASUS System Product Name/ROG STRIX X870E-E GAMING
WIFI, BIOS 2202 04/09/2026
[ 217.116582] Workqueue: events_highpri dm_irq_work_func [amdgpu]
[ 217.116721] RIP: 0010:native_queued_spin_lock_slowpath+0x213/0x2c0
[ 217.116723] Code: 8d 46 01 41 c1 e5 10 c1 e0 12 41 09 c5 44 89 e8 c1 e8 10
66 87 43 02 89 c2 c1 e2 10 81 fa ff ff 00 00 77 51 31 d2 eb 02 f3 90 <8b> 03 66
85 c0 75 f7 44 39 e8 0f 84 85 00 00 00 c6 03 01 48 85 d2
[ 217.116723] RSP: 0018:ffffd17bc08e3ce0 EFLAGS: 00000006
[ 217.116724] RAX: 0000000000081000 RBX: ffffffff8d4176ff RCX: 0000000000000000
[ 217.116724] RDX: 0000000000000000 RSI: ffffffff8ca9f20c RDI: ffffffff8ca88b37
[ 217.116725] RBP: ffffd17bc08e3d00 R08: 8080808080808080 R09: ffff8a5c8186b100
[ 217.116725] R10: ffff8a5c80052ec0 R11: fefefefefefefeff R12: ffff8a63ac2731c0
[ 217.116725] R13: 0000000000080000 R14: 0000000000000001 R15: 0000000000000000
[ 217.116725] FS: 0000000000000000(0000) GS:ffff8a641ee49000(0000)
knlGS:0000000000000000
[ 217.116726] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 217.116726] CR2: 0000562bd6ca9000 CR3: 0000000200436000 CR4: 0000000000750ef0
[ 217.116727] PKRU: 55555554
[ 217.116727] Call Trace:
[ 217.116727] <TASK>
[ 217.116727] _raw_spin_lock_irqsave+0x4b/0x60
[ 217.116728] lock_timer_base+0x73/0xa0
[ 217.116730] timer_delete+0x36/0x80
[ 217.116730] try_to_grab_pending+0x121/0x1d0
[ 217.116731] __cancel_work+0x3b/0x100
[ 217.116732] ? finish_task_switch.isra.0+0x92/0x280
[ 217.116733] cancel_delayed_work+0x13/0x20
[ 217.116734] handle_hpd_irq+0x23/0x50 [amdgpu]
[ 217.116866] dm_irq_work_func+0x19/0x20 [amdgpu]
[ 217.116992] process_one_work+0x18f/0x3d0
[ 217.116993] worker_thread+0x2cf/0x410
[ 217.116994] ? _raw_spin_unlock_irqrestore+0x27/0x50
[ 217.116995] ? __pfx_worker_thread+0x10/0x10
[ 217.116996] kthread+0x11c/0x230
[ 217.116997] ? _raw_spin_unlock_irq+0x1f/0x40
[ 217.116998] ? __pfx_kthread+0x10/0x10
[ 217.116999] ret_from_fork+0x195/0x210
[ 217.117000] ? __pfx_kthread+0x10/0x10
[ 217.117001] ret_from_fork_asm+0x1a/0x30
[ 217.117002] </TASK>
[ 244.249406] watchdog: BUG: soft lockup - CPU#3 stuck for 22s!
[kworker/3:1:161]
Regards,
Jerry
> -----Original Message-----
> From: Zuo, Jerry
> Sent: Friday, June 5, 2026 12:10
> To: Aurabindo Pillai <[email protected]>; amd-
> [email protected]
> Cc: Wentland, Harry <[email protected]>; Li, Sun peng (Leo)
> <[email protected]>; Pillai, Aurabindo <[email protected]>; Li,
> Roman <[email protected]>; Lin, Wayne <[email protected]>; Chung,
> ChiaHsuan (Tom) <[email protected]>; Wheeler, Daniel
> <[email protected]>; Wu, Ray <[email protected]>; LIPSKI, IVAN
> <[email protected]>; Hung, Alex <[email protected]>; Lin, Ping Lei
> <[email protected]>; Chen, Chen-Yu <[email protected]>; Timur
> Kristóf <[email protected]>
> Subject: RE: [PATCH 21/24] drm/amd/display: Retry link detection on hotplug
>
> Hi Timur:
>
> Please let me know whether you have validated the sequence with any SST
> and HDMI monitors. Basically, the goal is to confirm that the link retry
> workqueue gets executed in the hotplug and dpms use cases without
> introducing side effects. We've tested locally, but we don't see the link
> retry workqueue getting executed with either SST or HDMI monitors. That
> makes it hard for us to validate the new error handling logic. It would be
> perfect if you have already done that on your local setup to confirm the
> new logic is regression-free for SST and HDMI.
>
> Apart from that, we found a regression in MST. MST has its own detection
> logic and should be handled separately from SST and HDMI.
>
> Regards,
> Jerry
>
> > -----Original Message-----
> > From: Aurabindo Pillai <[email protected]>
> > Sent: Thursday, June 4, 2026 10:52
> > To: [email protected]
> > Cc: Wentland, Harry <[email protected]>; Li, Sun peng (Leo)
> > <[email protected]>; Pillai, Aurabindo <[email protected]>;
> > Li, Roman <[email protected]>; Lin, Wayne <[email protected]>;
> Chung,
> > ChiaHsuan (Tom) <[email protected]>; Zuo, Jerry
> > <[email protected]>; Wheeler, Daniel <[email protected]>; Wu,
> Ray
> > <[email protected]>; LIPSKI, IVAN <[email protected]>; Hung, Alex
> > <[email protected]>; Lin, Ping Lei <[email protected]>; Chen, Chen-
> > Yu <[email protected]>; Timur Kristóf <[email protected]>
> > Subject: [PATCH 21/24] drm/amd/display: Retry link detection on
> > hotplug
> >
> > From: Timur Kristóf <[email protected]>
> >
> > When dc_link_detect_connection_type thinks that a display is
> > connected, but dc_link_detect failed, enqueue delayed work to retry the
> link detection again.
> >
> > Useful when eg. HPD pin is high but the display isn't ready and didn't
> > respond to DDC.
> >
> > - The display is "slow to wake up", ie. DDC isn't ready,
> > for example we couldn't read EDID. Can happen with any
> > connector type with certain "slow" displays.
> > Some displays may take up to 15~20 sec or more to wake up.
> >
> > - On hotplug, the HPD pin may make contact before the DDC pins,
> > so we couldn't read the EDID. This most often happens with
> > DVI connectors, rarely with HDMI. It is not impossible but
> > extremely rare with other connector types.
> >
> > Signed-off-by: Timur Kristóf <[email protected]>
> > Signed-off-by: Aurabindo Pillai <[email protected]>
> > Reviewed-by: Alex Hung <[email protected]>
> > ---
> > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 138
> >
> ++++++++++++++++++ .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > | 16 ++
> > 2 files changed, 154 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index d1b1eb67d937..40295a5edbec 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -161,6 +161,17 @@
> MODULE_FIRMWARE(FIRMWARE_DCN_42_DMUB);
> > #define FIRMWARE_DCN_42B_DMUB "amdgpu/dcn_4_2_1_dmcub.bin"
> > MODULE_FIRMWARE(FIRMWARE_DCN_42B_DMUB);
> >
> > +/**
> > + * define AMDGPU_DM_HPD_MAX_NUM_RETRIES - maximum amount of
> > retries for
> > +hotplug detection */ #define AMDGPU_DM_HPD_MAX_NUM_RETRIES 5
> > +
> > +/**
> > + * define AMDGPU_DM_HPD_RETRY_DELAY_MSEC - millisecond delay
> > between
> > +hotplug detection retries */ #define
> > AMDGPU_DM_HPD_RETRY_DELAY_MSEC
> > +1500
> > +
> > +
> > /**
> > * DOC: overview
> > *
> > @@ -959,6 +970,125 @@ static void dm_handle_hpd_work(struct
> > work_struct *work)
> >
> > }
> >
> > +/**
> > + * dm_handle_delayed_hpd_work() - Handle delayed HPD (hotplug
> > +detection)
> > + *
> > + * @w: Base work item structure
> > + *
> > + * Used for retrying HPD after a delay. Just calls the normal HPD helper.
> > + */
> > +static void dm_handle_delayed_hpd_work(struct work_struct *work) {
> > + struct delayed_work *dw = container_of(work, struct delayed_work,
> > work);
> > + struct delayed_hpd_work *w = container_of(dw, struct
> > delayed_hpd_work, work);
> > + struct amdgpu_dm_connector *aconn = w->aconn;
> > + enum dc_detect_reason reason = w->reason;
> > +
> > + kfree(w);
> > + handle_hpd_irq_helper(aconn, reason); }
> > +
> > +/**
> > + * dm_cancel_delayed_hpd_work() - Cancel pending hotplug detection
> > +work for a connector
> > + *
> > + * @aconnector: Connector on which the HPD event occurred */ static
> > +void dm_cancel_delayed_hpd_work(struct amdgpu_dm_connector
> > *aconnector)
> > +{
> > + if (!aconnector || !aconnector->delayed_hpd_work)
> > + return;
> > +
> > + cancel_delayed_work(&aconnector->delayed_hpd_work->work);
> > + aconnector->delayed_hpd_work = NULL; }
> > +
> > +/**
> > + * dm_cancel_all_delayed_hpd_work() - Cancel all pending hotplug
> > +detection work on the device
> > + *
> > + * @dev: DRM device pointer
> > + */
> > +static void dm_cancel_all_delayed_hpd_work(struct drm_device *dev) {
> > + struct drm_connector *connector;
> > + struct drm_connector_list_iter iter;
> > +
> > + drm_connector_list_iter_begin(dev, &iter);
> > + drm_for_each_connector_iter(connector, &iter) {
> > + if (connector->connector_type ==
> > DRM_MODE_CONNECTOR_WRITEBACK)
> > + continue;
> > +
> > +
> > dm_cancel_delayed_hpd_work(to_amdgpu_dm_connector(connecto
> > r));
> > + }
> > + drm_connector_list_iter_end(&iter);
> > +}
> > +
> > +/**
> > + * dm_queue_delayed_hpd_work() - Enqueue delayed work to handle
> > hotplug
> > +detection
> > + *
> > + * @aconnector: Connector on which the HPD event occurred
> > + * @reason: Reason why we are attempting the HPD
> > + * @msecs: Millisecond delay after which the delayed work is going to
> > +happen
> > + *
> > + * When dc_link_detect_connection_type thinks that a display is
> > +connected,
> > + * but dc_link_detect failed, enqueue delayed work to retry the link
> > + * detection again.
> > + *
> > + * Useful when eg. HPD pin is high but the display isn't ready and
> > + * didn't respond to DDC.
> > + *
> > + * - On boot or suspend/resume, the display is "slow to wake up",
> > + * ie. DDC isn't ready, for example we couldn't read DP link caps or
> > EDID.
> > + * Can happen to any connector with certain "slow" displays.
> > + *
> > + * - On hotplug, the HPD pin may make contact before the DDC pins,
> > + * so we couldn't read the EDID. Can happen to any connector but
> > + * most often to DVI and sometimes to HDMI (rarely to DP).
> > + *
> > + */
> > +static void dm_queue_delayed_hpd_work(struct amdgpu_dm_connector
> > *aconnector,
> > + const enum dc_detect_reason reason,
> > + const unsigned int msecs)
> > +{
> > + struct drm_device *dev = aconnector->base.dev;
> > + struct amdgpu_device *adev = drm_to_adev(dev);
> > + struct delayed_hpd_work *w;
> > +
> > + if (!aconnector || !aconnector->dc_link ||
> > + aconnector->dc_link->type == dc_connection_none)
> > + return;
> > +
> > + /* Don't retry polled connectors, the polling is going to detect it. */
> > + if (aconnector->base.polled != DRM_CONNECTOR_POLL_HPD)
> > + return;
> > +
> > + ++aconnector->num_hpd_retries;
> > +
> > + drm_dbg(dev, "Can't detect link on %s on try %d\n",
> > + aconnector->base.name, aconnector->num_hpd_retries);
> > +
> > + if (aconnector->num_hpd_retries >
> > AMDGPU_DM_HPD_MAX_NUM_RETRIES) {
> > + drm_warn(dev, "Too many retries on %s: %d, giving up\n",
> > + aconnector->base.name, aconnector-
> > >num_hpd_retries);
> > + aconnector->num_hpd_retries = 0;
> > + return;
> > + }
> > +
> > + w = kzalloc(sizeof(*w), GFP_ATOMIC);
> > +
> > + if (!w)
> > + return;
> > +
> > + INIT_DELAYED_WORK(&w->work, dm_handle_delayed_hpd_work);
> > + w->aconn = aconnector;
> > + w->reason = reason;
> > + aconnector->delayed_hpd_work = w;
> > +
> > + drm_warn(dev, "Enqueueing next retry on %s\n",
> > + aconnector->base.name);
> > + queue_delayed_work(adev->dm.delayed_hpd_wq, &w->work,
> > + msecs_to_jiffies(msecs));
> > +}
> > +
> > static const char *dmub_notification_type_str(enum
> > dmub_notification_type
> > e) {
> > switch (e) {
> > @@ -3249,6 +3379,7 @@ static int dm_hw_fini(struct amdgpu_ip_block
> > *ip_block) {
> > struct amdgpu_device *adev = ip_block->adev;
> >
> > + dm_cancel_all_delayed_hpd_work(&adev->ddev);
> > amdgpu_dm_hpd_fini(adev);
> >
> > amdgpu_dm_irq_fini(adev);
> > @@ -3422,6 +3553,8 @@ static int dm_suspend(struct amdgpu_ip_block
> > *ip_block)
> > struct amdgpu_device *adev = ip_block->adev;
> > struct amdgpu_display_manager *dm = &adev->dm;
> >
> > + dm_cancel_all_delayed_hpd_work(&adev->ddev);
> > +
> > if (amdgpu_in_reset(adev)) {
> > enum dc_status res;
> >
> > @@ -4451,6 +4584,9 @@ static void handle_hpd_irq_helper(struct
> > amdgpu_dm_connector *aconnector,
> > if (aconnector->base.force ==
> > DRM_FORCE_UNSPECIFIED ||
> > reason == DETECT_REASON_HPDRX)
> >
> > drm_kms_helper_connector_hotplug_event(connector);
> > + } else {
>
>
> We need to return here if aconnector->mst_mgr.mst_state is set.
>
>
> > + dm_queue_delayed_hpd_work(aconnector, reason,
> > +
> > AMDGPU_DM_HPD_RETRY_DELAY_MSEC);
> > }
> > }
> > }
> > @@ -4459,6 +4595,8 @@ static void handle_hpd_irq(void *param) {
> > struct amdgpu_dm_connector *aconnector = (struct
> amdgpu_dm_connector
> > *)param;
> >
> > + /* Cancel any pending work */
> > + dm_cancel_delayed_hpd_work(aconnector);
> > handle_hpd_irq_helper(aconnector, DETECT_REASON_HPD);
> >
> > }
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > index 7d37c1612131..9a66c9e2b78d 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > @@ -136,6 +136,18 @@ struct dmub_hpd_work {
> > struct amdgpu_device *adev;
> > };
> >
> > +/**
> > + * struct delayed_hpd_work - Handle delayed HPD (hot plug detection)
> > +work
> > + *
> > + * @work: Base structure, kernel work data for the work event
> > + * @aconn: Pointer to connector where the HPD event happened */
> > +struct delayed_hpd_work {
> > + struct delayed_work work;
> > + struct amdgpu_dm_connector *aconn;
> > + enum dc_detect_reason reason;
> > +};
> > +
> > /**
> > * struct vblank_control_work - Work data for vblank control
> > * @work: Kernel work data for the work event @@ -801,6 +813,10 @@
> > struct amdgpu_dm_connector {
> > /* number of modes generated from EDID at 'dc_sink' */
> > int num_modes;
> >
> > + /* number of retries on hot plug detection */
> > + int num_hpd_retries;
> > + struct delayed_hpd_work *delayed_hpd_work;
> > +
> > /* The 'old' sink - before an HPD.
> > * The 'current' sink is in dc_link->sink. */
> > struct dc_sink *dc_sink;
> > --
> > 2.54.0