Use the uevent mechanism to expose the GPU reset state so that system tools can monitor the device reset status more accurately.
example:
$ sudo cat /sys/kernel/debug/dri/<minor>/amdgpu_gpu_recover

KERNEL[172.053149] change /devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0
SUBSYSTEM=pci
RESET=1
DRIVER=amdgpu
PCI_CLASS=30000
PCI_ID=1002:73BF
PCI_SUBSYS_ID=1002:0E3A
PCI_SLOT_NAME=0000:05:00.0
MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00
SEQNUM=6235

KERNEL[173.137681] change /devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0 (pci)
ACTION=change
DEVPATH=/devices/pci0000:00/0000:00:03.1/0000:03:00.0/0000:04:00.0/0000:05:00.0
SUBSYSTEM=pci
RESET=0
DRIVER=amdgpu
PCI_CLASS=30000
PCI_ID=1002:73BF
PCI_SUBSYS_ID=1002:0E3A
PCI_SLOT_NAME=0000:05:00.0
MODALIAS=pci:v00001002d000073BFsv00001002sd00000E3Abc03sc00i00
SEQNUM=6236

Signed-off-by: Yang Wang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 ++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a0df4cabb99..73c946d9cbe1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1805,4 +1805,7 @@ void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
 			   uint64_t uid);
 uint64_t amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
 			       enum amdgpu_uid_type type, uint8_t inst);
+
+int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a77000c2e0bb..300cc22dad91 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6318,6 +6318,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
 
 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list, reset_list) {
+		amdgpu_device_uevent_reset(tmp_adev, false);
 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
 		/*TODO Should we stop ?*/
 		if (r) {
@@ -6362,6 +6363,8 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
 		 * in before drm_sched_start.
 		 */
 		amdgpu_device_stop_pending_resets(tmp_adev);
+
+		amdgpu_device_uevent_reset(tmp_adev, true);
 	}
 
 	return r;
@@ -7669,3 +7672,46 @@ u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
 
 	return uid_info->uid[type][inst];
 }
+
+/**
+ * amdgpu_device_uevent_emit - send a uevent carrying one formatted variable
+ * @adev: device whose adev->dev kobject is used as the event source
+ * @action: kobject action to report
+ * @fmt: printf-style format that produces the single environment string
+ *
+ * Return: -ENOMEM if the string cannot be allocated, otherwise the value
+ * returned by kobject_uevent_env().
+ */
+__printf(3, 4)
+static int amdgpu_device_uevent_emit(struct amdgpu_device *adev, enum kobject_action action,
+				     const char *fmt, ...)
+{
+	char *envp[2] = { NULL, NULL };
+	va_list args;
+	int ret;
+
+	va_start(args, fmt);
+	envp[0] = kvasprintf(GFP_KERNEL, fmt, args);
+	va_end(args);
+	if (!envp[0])
+		return -ENOMEM;
+
+	ret = kobject_uevent_env(&adev->dev->kobj, action, envp);
+
+	/* kvasprintf() allocates with kmalloc, so kfree() is the matching release */
+	kfree(envp[0]);
+
+	return ret;
+}
+
+/**
+ * amdgpu_device_uevent_reset - report the GPU reset state through a uevent
+ * @adev: device the KOBJ_CHANGE event is emitted for
+ * @done: false emits "RESET=1", true emits "RESET=0"
+ *
+ * Return: the result of amdgpu_device_uevent_emit().
+ */
+int amdgpu_device_uevent_reset(struct amdgpu_device *adev, bool done)
+{
+	return amdgpu_device_uevent_emit(adev, KOBJ_CHANGE, "RESET=%d", done ? 0 : 1);
+}
-- 
2.34.1
