[Public] There is already an existing mechanism for this: drm_dev_wedged_event. Perhaps it would be better to extend that event to carry additional information, such as pre-reset and post-reset state along with the cause of the reset, rather than adding a separate uevent?
Thanks, Lijo -----Original Message----- From: amd-gfx <[email protected]> On Behalf Of Yang Wang Sent: Friday, September 26, 2025 12:04 PM To: [email protected] Cc: Zhang, Hawking <[email protected]>; Deucher, Alexander <[email protected]> Subject: [PATCH] drm/amdgpu: notify amdgpu gpu reset state via uevent Use the uevent mechanism to expose the GPU reset state, so that the system tool can more accurately monitor the device reset status. example: $ sudo cat /sys/kernel/debug/dri/<minor>/amdgpu_gpu_recover KERNEL[172.053149] change /devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci) ACTION=change DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 SUBSYSTEM=pci RESET=1 DRIVER=amdgpu PCI_CLASS=30000 PCI_ID=1002:73BF PCI_SUBSYS_ID=1002:0E3A PCI_SLOT_NAME=0000:05:00.0 MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00 SEQNUM=6235 KERNEL[173.137681] change /devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci) ACTION=change DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 SUBSYSTEM=pci RESET=0 DRIVER=amdgpu PCI_CLASS=30000 PCI_ID=1002:73BF PCI_SUBSYS_ID=1002:0E3A PCI_SLOT_NAME=0000:05:00.0 MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00 SEQNUM=6236 Signed-off-by: Yang Wang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 39 ++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 2a0df4cabb99..73c946d9cbe1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1805,4 +1805,7 @@ void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, uint64_t uid); uint64_t amdgpu_device_get_uid(struct amdgpu_uid *uid_info, enum amdgpu_uid_type type, uint8_t inst); + +int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done); + #endif diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a77000c2e0bb..300cc22dad91 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6318,6 +6318,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list, reset_list) { + amdgpu_device_uevent_reset(tmp_adev, false); r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -6362,6 +6363,8 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev, * in before drm_sched_start. */ amdgpu_device_stop_pending_resets(tmp_adev); + + amdgpu_device_uevent_reset(tmp_adev, true); } return r; @@ -7669,3 +7672,39 @@ u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, return uid_info->uid[type][inst]; } + +__printf(3, 4) +static int amdgpu_device_uevent_emit(struct amdgpu_device *adev, enum kobject_action action, + char *fmt, ...) +{ + struct kobject *kobj = &adev->dev->kobj; + char *uevent_env[2], *tmp; + va_list ap; + int ret; + + va_start(ap, fmt); + tmp = kvasprintf(GFP_KERNEL, fmt, ap); + va_end(ap); + + if (!tmp) { + ret = -ENOMEM; + goto out; + } + + uevent_env[0] = tmp; + uevent_env[1] = NULL; + + ret = kobject_uevent_env(kobj, action, uevent_env); + + kvfree(tmp); + +out: + return ret; +} + +int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done) { + int val = done ? 0 : 1; + + return amdgpu_device_uevent_emit(adev, KOBJ_CHANGE, "RESET=%d", val); +} -- 2.34.1
