RE: [PATCH] drm/amdgpu: notify amdgpu gpu reset state via uevent

Lazar, Lijo Thu, 25 Sep 2025 23:55:32 -0700

[Public]

There is already an existing event for this: drm_dev_wedged_event. Perhaps it
would be better to extend that event to carry additional information, such as
the pre-reset and post-reset state along with the cause of the reset?


Thanks,
Lijo
-----Original Message-----
From: amd-gfx <[email protected]> On Behalf Of Yang Wang
Sent: Friday, September 26, 2025 12:04 PM
To: [email protected]
Cc: Zhang, Hawking <[email protected]>; Deucher, Alexander 
<[email protected]>
Subject: [PATCH] drm/amdgpu: notify amdgpu gpu reset state via uevent

Use the uevent mechanism to expose the GPU reset state, so that system tools
can monitor the device reset status more accurately.

example:
$ sudo cat /sys/kernel/debug/dri/<minor>/amdgpu_gpu_recover

KERNEL[172.053149] change   /devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0
SUBSYSTEM=pci
RESET=1
DRIVER=amdgpu
PCI_CLASS=30000
PCI_ID=1002:73BF
PCI_SUBSYS_ID=1002:0E3A
PCI_SLOT_NAME=0000:05:00.0
MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00
SEQNUM=6235

KERNEL[173.137681] change   /devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0
SUBSYSTEM=pci
RESET=0
DRIVER=amdgpu
PCI_CLASS=30000
PCI_ID=1002:73BF
PCI_SUBSYS_ID=1002:0E3A
PCI_SLOT_NAME=0000:05:00.0
MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00
SEQNUM=6236

Signed-off-by: Yang Wang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 39 ++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a0df4cabb99..73c946d9cbe1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1805,4 +1805,7 @@ void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
                           uint64_t uid);
 uint64_t amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
                               enum amdgpu_uid_type type, uint8_t inst);
+
+int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a77000c2e0bb..300cc22dad91 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6318,6 +6318,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,

 retry: /* Rest of adevs pre asic reset from XGMI hive. */
        list_for_each_entry(tmp_adev, device_list, reset_list) {
+               amdgpu_device_uevent_reset(tmp_adev, false);
                r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
                /*TODO Should we stop ?*/
                if (r) {
@@ -6362,6 +6363,8 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
                 * in before drm_sched_start.
                 */
                amdgpu_device_stop_pending_resets(tmp_adev);
+
+               amdgpu_device_uevent_reset(tmp_adev, true);
        }

        return r;
@@ -7669,3 +7672,39 @@ u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,

        return uid_info->uid[type][inst];
 }
+
+__printf(3, 4)
+static int amdgpu_device_uevent_emit(struct amdgpu_device *adev, enum kobject_action action,
+                                    char *fmt, ...)
+{
+       struct kobject *kobj = &adev->dev->kobj;
+       char *uevent_env[2], *tmp;
+       va_list ap;
+       int ret;
+
+       va_start(ap, fmt);
+       tmp = kvasprintf(GFP_KERNEL, fmt, ap);
+       va_end(ap);
+
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       uevent_env[0] = tmp;
+       uevent_env[1] = NULL;
+
+       ret = kobject_uevent_env(kobj, action, uevent_env);
+
+       kvfree(tmp);
+
+out:
+       return ret;
+}
+
+int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done) {
+       int val = done ? 0 : 1;
+
+       return amdgpu_device_uevent_emit(adev, KOBJ_CHANGE, "RESET=%d", val);
+}
--
2.34.1

RE: [PATCH] drm/amdgpu: notify amdgpu gpu reset state via uevent

Reply via email to