[AMD Official Use Only - AMD Internal Distribution Only] ok
-----Original Message----- From: Lazar, Lijo <[email protected]> Sent: Tuesday, November 4, 2025 3:24 PM To: Chai, Thomas <[email protected]>; [email protected] Cc: Zhang, Hawking <[email protected]>; Zhou1, Tao <[email protected]>; Li, Candice <[email protected]>; Yang, Stanley <[email protected]>; Su, Joe <[email protected]> Subject: Re: [PATCH] drm/amdgpu: suspend ras module before gpu reset On 11/3/2025 1:57 PM, YiPeng Chai wrote: > During gpu reset, all GPU-related resources are inaccessible. To avoid > affecting ras functionality, suspend ras module before gpu reset and > resume it after gpu reset is complete. > > V2: > Rename functions to avoid misunderstanding. > > V3: > Move flush_delayed_work to amdgpu_ras_process_pause, > Move schedule_delayed_work to amdgpu_ras_process_unpause. > > V4: > Rename functions. > > Signed-off-by: YiPeng Chai <[email protected]> > Reviewed-by: Tao Zhou <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 26 ++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 ++- > .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c | 22 +++++++ > .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h | 5 ++ > .../drm/amd/ras/ras_mgr/amdgpu_ras_process.c | 64 +++++++++++++++++++ > .../drm/amd/ras/ras_mgr/amdgpu_ras_process.h | 4 ++ > .../gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c | 6 ++ > drivers/gpu/drm/amd/ras/rascore/ras.h | 2 + > drivers/gpu/drm/amd/ras/rascore/ras_process.c | 7 ++ > 9 files changed, 142 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index cc6e59208cac..9e8802ccc75e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -71,6 +71,7 @@ > > #include "amdgpu_xgmi.h" > #include "amdgpu_ras.h" > +#include "amdgpu_ras_mgr.h" > #include "amdgpu_pmu.h" > #include "amdgpu_fru_eeprom.h" > #include "amdgpu_reset.h" > @@ -6586,6 +6587,27 @@ static void amdgpu_device_gpu_resume(struct > amdgpu_device *adev, > } > } > > +static void amdgpu_ras_pre_reset(struct amdgpu_device *adev, > + struct list_head *device_list) { > + struct amdgpu_device *tmp_adev = NULL; > + > + list_for_each_entry(tmp_adev, device_list, reset_list) { > + if (amdgpu_uniras_enabled(tmp_adev)) > + amdgpu_ras_mgr_pre_reset(tmp_adev); > + } > +} > + > +static void amdgpu_ras_post_reset(struct amdgpu_device *adev, > + struct list_head *device_list) { > + struct amdgpu_device *tmp_adev = NULL; > + > + list_for_each_entry(tmp_adev, device_list, reset_list) { > + if (amdgpu_uniras_enabled(tmp_adev)) > + amdgpu_ras_mgr_post_reset(tmp_adev); > + } > +} > One nit - could you move above ones to amdgpu_ras.c or ras core related file? Thanks, Lijo > /** > * amdgpu_device_gpu_recover - reset the asic and recover scheduler > @@ -6660,6 +6682,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > *adev, > goto end_reset; > } > > + /* Cannot be called after locking reset domain */ > + amdgpu_ras_pre_reset(adev, &device_list); > + > /* We need to lock reset domain only once both for XGMI and single > device */ > amdgpu_device_recovery_get_reset_lock(adev, &device_list); > > @@ -6691,6 +6716,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > *adev, > reset_unlock: > amdgpu_device_recovery_put_reset_lock(adev, &device_list); > end_reset: > + amdgpu_ras_post_reset(adev, &device_list); > if (hive) { > mutex_unlock(&hive->hive_lock); > amdgpu_put_xgmi_hive(hive); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 5377cde0c55d..cb4f4b5668ab 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -2921,8 +2921,12 @@ static void amdgpu_ras_do_recovery(struct work_struct > *work) > type = amdgpu_ras_get_fatal_error_event(adev); > list_for_each_entry(remote_adev, > device_list_handle, gmc.xgmi.head) { > - amdgpu_ras_query_err_status(remote_adev); > - amdgpu_ras_log_on_err_counter(remote_adev, type); > + if (amdgpu_uniras_enabled(remote_adev)) { > + amdgpu_ras_mgr_update_ras_ecc(remote_adev); > + } else { > + amdgpu_ras_query_err_status(remote_adev); > + amdgpu_ras_log_on_err_counter(remote_adev, > type); > + } > } > > } > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c > index f8ec0f26a9e7..36c665c3ee48 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c > @@ -622,3 +622,25 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct > amdgpu_device *adev, > > return ret; > } > + > +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev) { > + if (!amdgpu_ras_mgr_is_ready(adev)) { > + RAS_DEV_ERR(adev, "Invalid ras suspend!\n"); > + return -EPERM; > + } > + > + amdgpu_ras_process_pre_reset(adev); > + return 0; > +} > + > +int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev) { > + if (!amdgpu_ras_mgr_is_ready(adev)) { > + RAS_DEV_ERR(adev, "Invalid ras resume!\n"); > + return -EPERM; > + } > + > + amdgpu_ras_process_post_reset(adev); > + return 0; > +} > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h > index 42f190a8feb9..8fb7eb4b8f13 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h > @@ -52,6 +52,9 @@ struct amdgpu_ras_mgr { > struct ras_event_manager ras_event_mgr; > uint64_t last_poison_consumption_seqno; > bool ras_is_ready; > + > + bool is_paused; > + struct completion ras_event_done; > }; > > extern const struct amdgpu_ip_block_version ras_v1_0_ip_block; @@ > -75,4 +78,6 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev); > int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, > uint32_t cmd_id, void *input, uint32_t input_size, > void *output, uint32_t out_size); > +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev); int > +amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev); > #endif > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c > index 6727fc9a2b9b..5782c007de71 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c > @@ -29,6 +29,7 @@ > #include "amdgpu_ras_process.h" > > #define RAS_MGR_RETIRE_PAGE_INTERVAL 100 > +#define RAS_EVENT_PROCESS_TIMEOUT 1200 > > static void ras_process_retire_page_dwork(struct work_struct *work) > { > @@ -57,6 +58,9 @@ int amdgpu_ras_process_init(struct amdgpu_device *adev) > { > struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); > > + ras_mgr->is_paused = false; > + init_completion(&ras_mgr->ras_event_done); > + > INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, > ras_process_retire_page_dwork); > > return 0; > @@ -66,6 +70,7 @@ int amdgpu_ras_process_fini(struct amdgpu_device *adev) > { > struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); > > + ras_mgr->is_paused = false; > /* Save all cached bad pages to eeprom */ > flush_delayed_work(&ras_mgr->retire_page_dwork); > cancel_delayed_work_sync(&ras_mgr->retire_page_dwork); > @@ -124,3 +129,62 @@ int > amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device > *adev, > > return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false); > } > + > +int amdgpu_ras_process_begin(struct amdgpu_device *adev) { > + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); > + > + if (ras_mgr->is_paused) > + return -EAGAIN; > + > + reinit_completion(&ras_mgr->ras_event_done); > + return 0; > +} > + > +int amdgpu_ras_process_end(struct amdgpu_device *adev) { > + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); > + > + complete(&ras_mgr->ras_event_done); > + return 0; > +} > + > +int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev) { > + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); > + long rc; > + > + if (!ras_mgr || !ras_mgr->ras_core) > + return -EINVAL; > + > + if (!ras_mgr->ras_core->is_initialized) > + return -EPERM; > + > + ras_mgr->is_paused = true; > + > + /* Wait for RAS event processing to complete */ > + rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done, > + msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT)); > + if (rc <= 0) > + RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n", > + rc ? "interrupted" : "timeout"); > + > + flush_delayed_work(&ras_mgr->retire_page_dwork); > + return 0; > +} > + > +int amdgpu_ras_process_post_reset(struct amdgpu_device *adev) { > + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); > + > + if (!ras_mgr || !ras_mgr->ras_core) > + return -EINVAL; > + > + if (!ras_mgr->ras_core->is_initialized) > + return -EPERM; > + > + ras_mgr->is_paused = false; > + > + schedule_delayed_work(&ras_mgr->retire_page_dwork, 0); > + return 0; > +} > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h > index b9502bd21beb..d55cdaeac441 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h > @@ -34,4 +34,8 @@ int amdgpu_ras_process_handle_unexpected_interrupt(struct > amdgpu_device *adev, > void *data); > int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device > *adev, > void *data); > +int amdgpu_ras_process_begin(struct amdgpu_device *adev); int > +amdgpu_ras_process_end(struct amdgpu_device *adev); int > +amdgpu_ras_process_pre_reset(struct amdgpu_device *adev); int > +amdgpu_ras_process_post_reset(struct amdgpu_device *adev); > #endif > diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c > b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c > index f21cd55a25be..45ed8c3b5563 100644 > --- a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c > +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c > @@ -142,6 +142,12 @@ static int amdgpu_ras_sys_event_notifier(struct > ras_core_context *ras_core, > case RAS_EVENT_ID__RESET_GPU: > ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t > *)data); > break; > + case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN: > + ret = amdgpu_ras_process_begin(ras_core->dev); > + break; > + case RAS_EVENT_ID__RAS_EVENT_PROC_END: > + ret = amdgpu_ras_process_end(ras_core->dev); > + break; > default: > RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", > event_id); > break; > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h > b/drivers/gpu/drm/amd/ras/rascore/ras.h > index fa224b36e3f2..3396b2e0949d 100644 > --- a/drivers/gpu/drm/amd/ras/rascore/ras.h > +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h > @@ -115,6 +115,8 @@ enum ras_notify_event { > RAS_EVENT_ID__FATAL_ERROR_DETECTED, > RAS_EVENT_ID__RESET_GPU, > RAS_EVENT_ID__RESET_VF, > + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, > + RAS_EVENT_ID__RAS_EVENT_PROC_END, > }; > > enum ras_gpu_status { > diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.c > b/drivers/gpu/drm/amd/ras/rascore/ras_process.c > index 02f0657f78a3..3267dcdb169c 100644 > --- a/drivers/gpu/drm/amd/ras/rascore/ras_process.c > +++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.c > @@ -162,6 +162,11 @@ int ras_process_handle_ras_event(struct ras_core_context > *ras_core) > uint32_t umc_event_count; > int ret; > > + ret = ras_core_event_notify(ras_core, > + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL); > + if (ret) > + return ret; > + > ras_aca_clear_fatal_flag(ras_core); > ras_umc_log_pending_bad_bank(ras_core); > > @@ -185,6 +190,8 @@ int ras_process_handle_ras_event(struct ras_core_context > *ras_core) > atomic_set(&ras_proc->umc_interrupt_count, 0); > } > > + ras_core_event_notify(ras_core, > + RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL); > return ret; > } >
