AMD General

Hi Timur:

     The new link detection delay workqueue causes a hard hang on a Dell 6K DP2
monitor after a couple of hotplugs. Please refer to the dmesg below for reference.

[  182.455489] watchdog: CPU1: Watchdog detected hard LOCKUP on cpu 1
[  182.455490] Modules linked in: snd_seq_dummy snd_hrtimer qrtr intel_rapl_msr 
amd_atl intel_rapl_common binfmt_misc edac_mce_amd kvm_amd nls_iso8859_1 
snd_hda_codec_atihdmi snd_hda_codec_hdmi kvm amdgpu snd_hda_intel rapl 
eeepc_wmi amdxcp snd_hda_codec asus_wmi drm_panel_backlight_quirks 
sparse_keymap platform_profile wmi_bmof snd_usb_audio snd_usbmidi_lib 
snd_hda_core gpu_sched drm_buddy mc snd_intel_dspcfg drm_ttm_helper 
snd_intel_sdw_acpi ttm drm_exec snd_seq_midi snd_seq_midi_event 
drm_suballoc_helper snd_hwdep drm_client_lib snd_pcm snd_rawmidi 
drm_display_helper i2c_piix4 k10temp i2c_smbus snd_seq drm_kms_helper r8169 
snd_seq_device cec snd_timer snd rc_core i2c_algo_bit soundcore realtek 
input_leds joydev gpio_amdpt gpio_generic acpi_pad mac_hid sch_fq_codel drm 
efi_pstore nfnetlink hid_generic ucsi_acpi typec_ucsi typec usbhid hid 
ghash_clmulni_intel ccp ahci libahci video thunderbolt wmi parport_pc lp ppdev 
msr parport dmi_sysfs autofs4 aesni_intel
[  182.455511] CPU: 1 UID: 0 PID: 262 Comm: kworker/1:1H Kdump: loaded Not 
tainted 6.19.0-promotion-june2-patchy3+ #319 PREEMPT(voluntary)
[  182.455513] Hardware name: ASUS System Product Name/ROG STRIX X870E-E GAMING 
WIFI, BIOS 2202 04/09/2026
[  182.455514] Workqueue: events_highpri dm_irq_work_func [amdgpu]
[  182.455691] RIP: 0010:native_queued_spin_lock_slowpath+0x213/0x2c0
[  182.455695] Code: 8d 46 01 41 c1 e5 10 c1 e0 12 41 09 c5 44 89 e8 c1 e8 10 
66 87 43 02 89 c2 c1 e2 10 81 fa ff ff 00 00 77 51 31 d2 eb 02 f3 90 <8b> 03 66 
85 c0 75 f7 44 39 e8 0f 84 85 00 00 00 c6 03 01 48 85 d2
[  182.455696] RSP: 0018:ffffd17bc08e3ce0 EFLAGS: 00000006
[  182.455697] RAX: 0000000000081000 RBX: ffffffff8d4176ff RCX: 0000000000000000
[  182.455698] RDX: 0000000000000000 RSI: ffffffff8ca9f20c RDI: ffffffff8ca88b37
[  182.455698] RBP: ffffd17bc08e3d00 R08: 8080808080808080 R09: ffff8a5c8186b100
[  182.455698] R10: ffff8a5c80052ec0 R11: fefefefefefefeff R12: ffff8a63ac2731c0
[  182.455699] R13: 0000000000080000 R14: 0000000000000001 R15: 0000000000000000
[  182.455699] FS:  0000000000000000(0000) GS:ffff8a641ee49000(0000) 
knlGS:0000000000000000
[  182.455700] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  182.455700] CR2: 0000562bd6ca9000 CR3: 0000000200436000 CR4: 0000000000750ef0
[  182.455701] PKRU: 55555554
[  182.455701] Call Trace:
[  182.455701]  <TASK>
[  182.455702]  _raw_spin_lock_irqsave+0x4b/0x60
[  182.455704]  lock_timer_base+0x73/0xa0
[  182.455707]  timer_delete+0x36/0x80
[  182.455708]  try_to_grab_pending+0x121/0x1d0
[  182.455710]  __cancel_work+0x3b/0x100
[  182.455711]  ? finish_task_switch.isra.0+0x92/0x280
[  182.455713]  cancel_delayed_work+0x13/0x20
[  182.455714]  handle_hpd_irq+0x23/0x50 [amdgpu]
[  182.455859]  dm_irq_work_func+0x19/0x20 [amdgpu]
[  182.455991]  process_one_work+0x18f/0x3d0
[  182.455992]  worker_thread+0x2cf/0x410
[  182.455993]  ? _raw_spin_unlock_irqrestore+0x27/0x50
[  182.455994]  ? __pfx_worker_thread+0x10/0x10
[  182.455995]  kthread+0x11c/0x230
[  182.455997]  ? _raw_spin_unlock_irq+0x1f/0x40
[  182.455998]  ? __pfx_kthread+0x10/0x10
[  182.455999]  ret_from_fork+0x195/0x210
[  182.456001]  ? __pfx_kthread+0x10/0x10
[  182.456002]  ret_from_fork_asm+0x1a/0x30
[  182.456004]  </TASK>
[  185.660573] amdgpu 0000:03:00.0: [drm] *ERROR* [CRTC:423:crtc-0] flip_done 
timed out
[  185.660587] amdgpu 0000:03:00.0: [drm:drm_atomic_state_default_clear [drm]] 
Clearing atomic state 00000000f6b354ad
[  185.660622] amdgpu 0000:03:00.0: [drm:__drm_atomic_state_free [drm]] Freeing 
atomic state 00000000f6b354ad
[  217.116563] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[  217.116569] rcu:     1-...0: (9 GPs behind) idle=af8c/1/0x4000000000000000 
softirq=14209/14209 fqs=5572
[  217.116575] rcu:     (detected by 13, t=15002 jiffies, g=21165, q=9426 
ncpus=16)
[  217.116578] Sending NMI from CPU 13 to CPUs 1:
[  217.116580] NMI backtrace for cpu 1
[  217.116581] CPU: 1 UID: 0 PID: 262 Comm: kworker/1:1H Kdump: loaded Not 
tainted 6.19.0-promotion-june2-patchy3+ #319 PREEMPT(voluntary)
[  217.116582] Hardware name: ASUS System Product Name/ROG STRIX X870E-E GAMING 
WIFI, BIOS 2202 04/09/2026
[  217.116582] Workqueue: events_highpri dm_irq_work_func [amdgpu]
[  217.116721] RIP: 0010:native_queued_spin_lock_slowpath+0x213/0x2c0
[  217.116723] Code: 8d 46 01 41 c1 e5 10 c1 e0 12 41 09 c5 44 89 e8 c1 e8 10 
66 87 43 02 89 c2 c1 e2 10 81 fa ff ff 00 00 77 51 31 d2 eb 02 f3 90 <8b> 03 66 
85 c0 75 f7 44 39 e8 0f 84 85 00 00 00 c6 03 01 48 85 d2
[  217.116723] RSP: 0018:ffffd17bc08e3ce0 EFLAGS: 00000006
[  217.116724] RAX: 0000000000081000 RBX: ffffffff8d4176ff RCX: 0000000000000000
[  217.116724] RDX: 0000000000000000 RSI: ffffffff8ca9f20c RDI: ffffffff8ca88b37
[  217.116725] RBP: ffffd17bc08e3d00 R08: 8080808080808080 R09: ffff8a5c8186b100
[  217.116725] R10: ffff8a5c80052ec0 R11: fefefefefefefeff R12: ffff8a63ac2731c0
[  217.116725] R13: 0000000000080000 R14: 0000000000000001 R15: 0000000000000000
[  217.116725] FS:  0000000000000000(0000) GS:ffff8a641ee49000(0000) 
knlGS:0000000000000000
[  217.116726] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  217.116726] CR2: 0000562bd6ca9000 CR3: 0000000200436000 CR4: 0000000000750ef0
[  217.116727] PKRU: 55555554
[  217.116727] Call Trace:
[  217.116727]  <TASK>
[  217.116727]  _raw_spin_lock_irqsave+0x4b/0x60
[  217.116728]  lock_timer_base+0x73/0xa0
[  217.116730]  timer_delete+0x36/0x80
[  217.116730]  try_to_grab_pending+0x121/0x1d0
[  217.116731]  __cancel_work+0x3b/0x100
[  217.116732]  ? finish_task_switch.isra.0+0x92/0x280
[  217.116733]  cancel_delayed_work+0x13/0x20
[  217.116734]  handle_hpd_irq+0x23/0x50 [amdgpu]
[  217.116866]  dm_irq_work_func+0x19/0x20 [amdgpu]
[  217.116992]  process_one_work+0x18f/0x3d0
[  217.116993]  worker_thread+0x2cf/0x410
[  217.116994]  ? _raw_spin_unlock_irqrestore+0x27/0x50
[  217.116995]  ? __pfx_worker_thread+0x10/0x10
[  217.116996]  kthread+0x11c/0x230
[  217.116997]  ? _raw_spin_unlock_irq+0x1f/0x40
[  217.116998]  ? __pfx_kthread+0x10/0x10
[  217.116999]  ret_from_fork+0x195/0x210
[  217.117000]  ? __pfx_kthread+0x10/0x10
[  217.117001]  ret_from_fork_asm+0x1a/0x30
[  217.117002]  </TASK>
[  244.249406] watchdog: BUG: soft lockup - CPU#3 stuck for 22s! 
[kworker/3:1:161]

Regards,
Jerry

> -----Original Message-----
> From: Zuo, Jerry
> Sent: Friday, June 5, 2026 12:10
> To: Aurabindo Pillai <[email protected]>; amd-
> [email protected]
> Cc: Wentland, Harry <[email protected]>; Li, Sun peng (Leo)
> <[email protected]>; Pillai, Aurabindo <[email protected]>; Li,
> Roman <[email protected]>; Lin, Wayne <[email protected]>; Chung,
> ChiaHsuan (Tom) <[email protected]>; Wheeler, Daniel
> <[email protected]>; Wu, Ray <[email protected]>; LIPSKI, IVAN
> <[email protected]>; Hung, Alex <[email protected]>; Lin, Ping Lei
> <[email protected]>; Chen, Chen-Yu <[email protected]>; Timur
> Kristóf <[email protected]>
> Subject: RE: [PATCH 21/24] drm/amd/display: Retry link detection on hotplug
>
> Hi Timur:
>
>      Please let me know whether you have validated the sequence with any SST
> and HDMI monitors. Basically, the goal is to confirm that the link retry
> workqueue gets executed in the hotplug and DPMS use cases without introducing
> side effects. We've tested locally, but we don't see the link retry workqueue
> getting executed with either SST or HDMI monitors. That makes it hard for us
> to validate the new error handling logic. It would be ideal if you have
> already done that on your local setup to confirm the new logic is
> regression-free for SST and HDMI.
>
>      Apart from that, we found a regression in MST. MST has its own detection
> logic, and its handling should be separated from SST and HDMI.
>
> Regards,
> Jerry
>
> > -----Original Message-----
> > From: Aurabindo Pillai <[email protected]>
> > Sent: Thursday, June 4, 2026 10:52
> > To: [email protected]
> > Cc: Wentland, Harry <[email protected]>; Li, Sun peng (Leo)
> > <[email protected]>; Pillai, Aurabindo <[email protected]>;
> > Li, Roman <[email protected]>; Lin, Wayne <[email protected]>;
> Chung,
> > ChiaHsuan (Tom) <[email protected]>; Zuo, Jerry
> > <[email protected]>; Wheeler, Daniel <[email protected]>; Wu,
> Ray
> > <[email protected]>; LIPSKI, IVAN <[email protected]>; Hung, Alex
> > <[email protected]>; Lin, Ping Lei <[email protected]>; Chen, Chen-
> > Yu <[email protected]>; Timur Kristóf <[email protected]>
> > Subject: [PATCH 21/24] drm/amd/display: Retry link detection on
> > hotplug
> >
> > From: Timur Kristóf <[email protected]>
> >
> > When dc_link_detect_connection_type thinks that a display is
> > connected, but dc_link_detect failed, enqueue delayed work to retry the
> link detection again.
> >
> > Useful when eg. HPD pin is high but the display isn't ready and didn't
> > respond to DDC.
> >
> > - The display is "slow to wake up", ie. DDC isn't ready,
> >   for example we couldn't read EDID. Can happen with any
> >   connector type with certain "slow" displays.
> >   Some displays may take up to 15~20 sec or more to wake up.
> >
> > - On hotplug, the HPD pin may make contact before the DDC pins,
> >   so we couldn't read the EDID. This most often happens with
> >   DVI connectors, rarely with HDMI. It is not impossible but
> >   extremely rare with other connector types.
> >
> > Signed-off-by: Timur Kristóf <[email protected]>
> > Signed-off-by: Aurabindo Pillai <[email protected]>
> > Reviewed-by: Alex Hung <[email protected]>
> > ---
> >  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 138
> >
> ++++++++++++++++++  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > |  16 ++
> >  2 files changed, 154 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index d1b1eb67d937..40295a5edbec 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -161,6 +161,17 @@
> MODULE_FIRMWARE(FIRMWARE_DCN_42_DMUB);
> >  #define FIRMWARE_DCN_42B_DMUB "amdgpu/dcn_4_2_1_dmcub.bin"
> >  MODULE_FIRMWARE(FIRMWARE_DCN_42B_DMUB);
> >
> > +/**
> > + * define AMDGPU_DM_HPD_MAX_NUM_RETRIES - maximum amount of
> > retries for
> > +hotplug detection  */ #define AMDGPU_DM_HPD_MAX_NUM_RETRIES 5
> > +
> > +/**
> > + * define AMDGPU_DM_HPD_RETRY_DELAY_MSEC - millisecond delay
> > between
> > +hotplug detection retries  */ #define
> > AMDGPU_DM_HPD_RETRY_DELAY_MSEC
> > +1500
> > +
> > +
> >  /**
> >   * DOC: overview
> >   *
> > @@ -959,6 +970,125 @@ static void dm_handle_hpd_work(struct
> > work_struct *work)
> >
> >  }
> >
> > +/**
> > + * dm_handle_delayed_hpd_work() - Handle delayed HPD (hotplug
> > +detection)
> > + *
> > + * @w: Base work item structure
> > + *
> > + * Used for retrying HPD after a delay. Just calls the normal HPD helper.
> > + */
> > +static void dm_handle_delayed_hpd_work(struct work_struct *work) {
> > +   struct delayed_work *dw = container_of(work, struct delayed_work,
> > work);
> > +   struct delayed_hpd_work *w = container_of(dw, struct
> > delayed_hpd_work, work);
> > +   struct amdgpu_dm_connector *aconn = w->aconn;
> > +   enum dc_detect_reason reason = w->reason;
> > +
> > +   kfree(w);
> > +   handle_hpd_irq_helper(aconn, reason); }
> > +
> > +/**
> > + * dm_cancel_delayed_hpd_work() - Cancel pending hotplug detection
> > +work for a connector
> > + *
> > + * @aconnector: Connector on which the HPD event occurred  */ static
> > +void dm_cancel_delayed_hpd_work(struct amdgpu_dm_connector
> > *aconnector)
> > +{
> > +   if (!aconnector || !aconnector->delayed_hpd_work)
> > +           return;
> > +
> > +   cancel_delayed_work(&aconnector->delayed_hpd_work->work);
> > +   aconnector->delayed_hpd_work = NULL; }
> > +
> > +/**
> > + * dm_cancel_all_delayed_hpd_work() - Cancel all pending hotplug
> > +detection work on the device
> > + *
> > + * @dev: DRM device pointer
> > + */
> > +static void dm_cancel_all_delayed_hpd_work(struct drm_device *dev) {
> > +   struct drm_connector *connector;
> > +   struct drm_connector_list_iter iter;
> > +
> > +   drm_connector_list_iter_begin(dev, &iter);
> > +   drm_for_each_connector_iter(connector, &iter) {
> > +           if (connector->connector_type ==
> > DRM_MODE_CONNECTOR_WRITEBACK)
> > +                   continue;
> > +
> > +
> >     dm_cancel_delayed_hpd_work(to_amdgpu_dm_connector(connecto
> > r));
> > +   }
> > +   drm_connector_list_iter_end(&iter);
> > +}
> > +
> > +/**
> > + * dm_queue_delayed_hpd_work() - Enqueue delayed work to handle
> > hotplug
> > +detection
> > + *
> > + * @aconnector: Connector on which the HPD event occurred
> > + * @reason: Reason why we are attempting the HPD
> > + * @msecs: Millisecond delay after which the delayed work is going to
> > +happen
> > + *
> > + * When dc_link_detect_connection_type thinks that a display is
> > +connected,
> > + * but dc_link_detect failed, enqueue delayed work to retry the link
> > + * detection again.
> > + *
> > + * Useful when eg. HPD pin is high but the display isn't ready and
> > + * didn't respond to DDC.
> > + *
> > + * - On boot or suspend/resume, the display is "slow to wake up",
> > + *   ie. DDC isn't ready, for example we couldn't read DP link caps or 
> > EDID.
> > + *   Can happen to any connector with certain "slow" displays.
> > + *
> > + * - On hotplug, the HPD pin may make contact before the DDC pins,
> > + *   so we couldn't read the EDID. Can happen to any connector but
> > + *   most often to DVI and sometimes to HDMI (rarely to DP).
> > + *
> > + */
> > +static void dm_queue_delayed_hpd_work(struct amdgpu_dm_connector
> > *aconnector,
> > +                                 const enum dc_detect_reason reason,
> > +                                 const unsigned int msecs)
> > +{
> > +   struct drm_device *dev = aconnector->base.dev;
> > +   struct amdgpu_device *adev = drm_to_adev(dev);
> > +   struct delayed_hpd_work *w;
> > +
> > +   if (!aconnector || !aconnector->dc_link ||
> > +       aconnector->dc_link->type == dc_connection_none)
> > +           return;
> > +
> > +   /* Don't retry polled connectors, the polling is going to detect it. */
> > +   if (aconnector->base.polled != DRM_CONNECTOR_POLL_HPD)
> > +           return;
> > +
> > +   ++aconnector->num_hpd_retries;
> > +
> > +   drm_dbg(dev, "Can't detect link on %s on try %d\n",
> > +           aconnector->base.name, aconnector->num_hpd_retries);
> > +
> > +   if (aconnector->num_hpd_retries >
> > AMDGPU_DM_HPD_MAX_NUM_RETRIES) {
> > +           drm_warn(dev, "Too many retries on %s: %d, giving up\n",
> > +                    aconnector->base.name, aconnector-
> > >num_hpd_retries);
> > +           aconnector->num_hpd_retries = 0;
> > +           return;
> > +   }
> > +
> > +   w = kzalloc(sizeof(*w), GFP_ATOMIC);
> > +
> > +   if (!w)
> > +           return;
> > +
> > +   INIT_DELAYED_WORK(&w->work, dm_handle_delayed_hpd_work);
> > +   w->aconn = aconnector;
> > +   w->reason = reason;
> > +   aconnector->delayed_hpd_work = w;
> > +
> > +   drm_warn(dev, "Enqueueing next retry on %s\n",
> > +            aconnector->base.name);
> > +   queue_delayed_work(adev->dm.delayed_hpd_wq, &w->work,
> > +                      msecs_to_jiffies(msecs));
> > +}
> > +
> >  static const char *dmub_notification_type_str(enum
> > dmub_notification_type
> > e)  {
> >     switch (e) {
> > @@ -3249,6 +3379,7 @@ static int dm_hw_fini(struct amdgpu_ip_block
> > *ip_block)  {
> >     struct amdgpu_device *adev = ip_block->adev;
> >
> > +   dm_cancel_all_delayed_hpd_work(&adev->ddev);
> >     amdgpu_dm_hpd_fini(adev);
> >
> >     amdgpu_dm_irq_fini(adev);
> > @@ -3422,6 +3553,8 @@ static int dm_suspend(struct amdgpu_ip_block
> > *ip_block)
> >     struct amdgpu_device *adev = ip_block->adev;
> >     struct amdgpu_display_manager *dm = &adev->dm;
> >
> > +   dm_cancel_all_delayed_hpd_work(&adev->ddev);
> > +
> >     if (amdgpu_in_reset(adev)) {
> >             enum dc_status res;
> >
> > @@ -4451,6 +4584,9 @@ static void handle_hpd_irq_helper(struct
> > amdgpu_dm_connector *aconnector,
> >                     if (aconnector->base.force ==
> > DRM_FORCE_UNSPECIFIED ||
> >                         reason == DETECT_REASON_HPDRX)
> >
> >     drm_kms_helper_connector_hotplug_event(connector);
> > +           } else {
>
>
> We need to return here if aconnector->mst_mgr.mst_state is set.
>
>
> > +                   dm_queue_delayed_hpd_work(aconnector, reason,
> > +
> > AMDGPU_DM_HPD_RETRY_DELAY_MSEC);
> >             }
> >     }
> >  }
> > @@ -4459,6 +4595,8 @@ static void handle_hpd_irq(void *param)  {
> >     struct amdgpu_dm_connector *aconnector = (struct
> amdgpu_dm_connector
> > *)param;
> >
> > +   /* Cancel any pending work */
> > +   dm_cancel_delayed_hpd_work(aconnector);
> >     handle_hpd_irq_helper(aconnector, DETECT_REASON_HPD);
> >
> >  }
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > index 7d37c1612131..9a66c9e2b78d 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> > @@ -136,6 +136,18 @@ struct dmub_hpd_work {
> >     struct amdgpu_device *adev;
> >  };
> >
> > +/**
> > + * struct delayed_hpd_work - Handle delayed HPD (hot plug detection)
> > +work
> > + *
> > + * @work: Base structure, kernel work data for the work event
> > + * @aconn: Pointer to connector where the HPD event happened  */
> > +struct delayed_hpd_work {
> > +   struct delayed_work work;
> > +   struct amdgpu_dm_connector *aconn;
> > +   enum dc_detect_reason reason;
> > +};
> > +
> >  /**
> >   * struct vblank_control_work - Work data for vblank control
> >   * @work: Kernel work data for the work event @@ -801,6 +813,10 @@
> > struct amdgpu_dm_connector {
> >     /* number of modes generated from EDID at 'dc_sink' */
> >     int num_modes;
> >
> > +   /* number of retries on hot plug detection */
> > +   int num_hpd_retries;
> > +   struct delayed_hpd_work *delayed_hpd_work;
> > +
> >     /* The 'old' sink - before an HPD.
> >      * The 'current' sink is in dc_link->sink. */
> >     struct dc_sink *dc_sink;
> > --
> > 2.54.0

Reply via email to