On 2026-05-01 16:35, Hamza Mahfooz wrote:
> DMU already has robust hung state tracking, but timeout recovery
> was never hooked up, so do so now.
> 
> Signed-off-by: Hamza Mahfooz <[email protected]>

Thanks for the patch, Hamza. Implementing a DMUB reset in
dm_helpers_dmu_timeout() sounds like the right approach.
One comment inline.

> ---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 23 ++++++++++++++-----
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  1 +
>  .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 12 ++++++++--
>  3 files changed, 28 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index e96a12ff2d31..7be4ebee1cb7 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -1246,7 +1246,7 @@ static  void amdgpu_dm_audio_eld_notify(struct 
> amdgpu_device *adev, int pin)
>       }
>  }
>  
> -static int dm_dmub_hw_init(struct amdgpu_device *adev)
> +int amdgpu_dm_dmub_hw_init(struct amdgpu_device *adev)
>  {
>       const struct dmcub_firmware_header_v1_0 *hdr;
>       struct dmub_srv *dmub_srv = adev->dm.dmub_srv;
> @@ -1315,7 +1315,7 @@ static int dm_dmub_hw_init(struct amdgpu_device *adev)
>       /* if adev->firmware.load_type == AMDGPU_FW_LOAD_PSP,
>        * amdgpu_ucode_init_single_fw will load dmub firmware
>        * fw_inst_const part to cw0; otherwise, the firmware back door load
> -      * will be done by dm_dmub_hw_init
> +      * will be done by amdgpu_dm_dmub_hw_init().
>        */
>       if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
>               memcpy(fb_info->fb[DMUB_WINDOW_0_INST_CONST].cpu_addr, 
> fw_inst_const,
> @@ -1457,7 +1457,7 @@ static void dm_dmub_hw_resume(struct amdgpu_device 
> *adev)
>                       drm_warn(adev_to_drm(adev), "Wait for DMUB auto-load 
> failed: %d\n", status);
>       } else {
>               /* Perform the full hardware initialization. */
> -             r = dm_dmub_hw_init(adev);
> +             r = amdgpu_dm_dmub_hw_init(adev);
>               if (r)
>                       drm_err(adev_to_drm(adev), "DMUB interface failed to 
> initialize: status=%d\n", r);
>       }
> @@ -2041,6 +2041,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
>               goto error;
>       }
>  
> +     adev->dm.dc->debug.enable_dmu_recovery =
> +             amdgpu_device_should_recover_gpu(adev);
> +
>       if (amdgpu_dc_debug_mask & DC_DISABLE_PIPE_SPLIT) {
>               adev->dm.dc->debug.force_single_disp_pipe_split = false;
>               adev->dm.dc->debug.pipe_split_policy = MPC_SPLIT_AVOID;
> @@ -2090,7 +2093,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
>       if (adev->dm.dc->caps.dp_hdmi21_pcon_support)
>               drm_info(adev_to_drm(adev), "DP-HDMI FRL PCON supported\n");
>  
> -     r = dm_dmub_hw_init(adev);
> +     r = amdgpu_dm_dmub_hw_init(adev);
>       if (r) {
>               drm_err(adev_to_drm(adev), "DMUB interface failed to 
> initialize: status=%d\n", r);
>               goto error;
> @@ -3604,7 +3607,7 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
>                */
>               link_enc_cfg_copy(adev->dm.dc->current_state, dc_state);
>  
> -             r = dm_dmub_hw_init(adev);
> +             r = amdgpu_dm_dmub_hw_init(adev);
>               if (r) {
>                       drm_err(adev_to_drm(adev), "DMUB interface failed to 
> initialize: status=%d\n", r);
>                       return r;
> @@ -9623,7 +9626,15 @@ static void prepare_flip_isr(struct amdgpu_crtc *acrtc)
>  {
>  
>       assert_spin_locked(&acrtc->base.dev->event_lock);
> -     WARN_ON(acrtc->event);
> +
> +     /*
> +      * Compositors will refuse to make forward progress unless we send
> +      * the previous flip's completion event.
> +      */
> +     if (WARN_ON(acrtc->event)) {
> +             drm_crtc_send_vblank_event(&acrtc->base, acrtc->event);
> +             drm_crtc_vblank_put(&acrtc->base);
> +     }

I would expect this WARN_ON to trigger only after the 10s flip_done timeout
expires. 'This' commit is gated by drm_atomic_helper_wait_for_dependencies(),
so it can only make progress once that wait times out, at which point
acrtc->event and ->pflip_status are still armed from the previous commit.

In that case, we probably want to apply the same handling as the hunk above
to the cursor path here, and also raise a warning there:
https://elixir.bootlin.com/linux/v6.19.3/source/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c#L10170

- Leo

>  
>       acrtc->event = acrtc->base.state->event;
>  
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> index 74a8fe1a1999..dc808ee83c2a 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> @@ -1086,6 +1086,7 @@ int amdgpu_dm_verify_lut3d_size(struct amdgpu_device 
> *adev,
>  #define MAX_COLOR_LEGACY_LUT_ENTRIES 256
>  
>  void amdgpu_dm_init_color_mod(void);
> +int amdgpu_dm_dmub_hw_init(struct amdgpu_device *adev);
>  int amdgpu_dm_create_color_properties(struct amdgpu_device *adev);
>  int amdgpu_dm_verify_lut_sizes(const struct drm_crtc_state *crtc_state);
>  int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc);
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
> index 3b8ae7798a93..8f10117483e2 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
> @@ -33,6 +33,7 @@
>  #include <drm/drm_atomic.h>
>  #include <drm/drm_probe_helper.h>
>  #include <drm/amdgpu_drm.h>
> +#include <drm/drm_drv.h>
>  #include <drm/drm_edid.h>
>  #include <drm/drm_fixed.h>
>  
> @@ -1165,8 +1166,15 @@ void dm_set_dcn_clocks(struct dc_context *ctx, struct 
> dc_clocks *clks)
>  
>  void dm_helpers_dmu_timeout(struct dc_context *ctx)
>  {
> -     // TODO:
> -     //amdgpu_device_gpu_recover(dc_context->driver-context, NULL);
> +     struct amdgpu_device *adev = ctx->driver_context;
> +
> +     lockdep_assert_held(&adev->dm.dc_lock);
> +
> +     drm_info(adev_to_drm(adev), "attempting firmware reset\n");
> +     if (amdgpu_dm_dmub_hw_init(adev))
> +             drm_dev_wedged_event(adev_to_drm(adev),
> +                                  DRM_WEDGE_RECOVERY_REBIND |
> +                                  DRM_WEDGE_RECOVERY_BUS_RESET, NULL);
>  }
>  
>  void dm_helpers_smu_timeout(struct dc_context *ctx, unsigned int msg_id, 
> unsigned int param, unsigned int timeout_us)

Reply via email to