AMD General Seems fine to me. I will wait for Lijo and David to chime in on their v2/v3 change requests before I give a Reviewed-by.
Kent > -----Original Message----- > From: amd-gfx <[email protected]> On Behalf Of Zhang, > Yifan > Sent: May 24, 2026 7:18 AM > To: [email protected] > Cc: Kuehling, Felix <[email protected]>; Deucher, Alexander > <[email protected]>; Koenig, Christian > <[email protected]>; Yat Sin, David <[email protected]>; Lazar, > Lijo <[email protected]> > Subject: RE: [PATCH v4] drm/amdgpu: add ioctl to handle RAS poison error > > AMD General > > ping > > -----Original Message----- > From: Zhang, Yifan <[email protected]> > Sent: Thursday, May 21, 2026 5:05 PM > To: [email protected] > Cc: Kuehling, Felix <[email protected]>; Deucher, Alexander > <[email protected]>; Koenig, Christian > <[email protected]>; Yat Sin, David <[email protected]>; Lazar, > Lijo <[email protected]>; Zhang, Yifan <[email protected]> > Subject: [PATCH v4] drm/amdgpu: add ioctl to handle RAS poison error > > Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the > AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace > (ROCr) to control per-process SIGBUS delivery. 
> > Userspace for this can be found at: > https://github.com/ROCm/rocm-systems/pull/6190 > > Signed-off-by: Yifan Zhang <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 6 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 27 +++++++++ > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 70 +++++++++++++++++++++- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++++ > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 23 +++++++ > include/uapi/drm/amdgpu_drm.h | 21 +++++++ > 8 files changed, 163 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 5d7bfa59424a..771ec0608270 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1467,6 +1467,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc > *crtc); void amdgpu_disable_vblank_kms(struct drm_crtc *crtc); int > amdgpu_info_ioctl(struct drm_device *dev, void *data, > struct drm_file *filp); > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, > + struct drm_file *filp); > > /* > * functions used by amdgpu_encoder.c > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > index 5333e052d56d..68d83a6e6b3a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > @@ -210,6 +210,7 @@ int amdgpu_amdkfd_evict_userptr(struct > mmu_interval_notifier *mni, int amdgpu_amdkfd_bo_validate_and_fence(struct > amdgpu_bo *bo, > uint32_t domain, > struct dma_fence *fence); > +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms); > #else > static inline > bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) > @@ -241,6 +242,11 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct > amdgpu_bo *bo, { > return 0; > } > +static inline > +int amdgpu_amdkfd_set_sigbus_delay(struct 
task_struct *task, u32 ms) { > + return -EOPNOTSUPP; > +} > #endif > /* Shared API */ > int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 1781c0c3d010..4d4d21babc61 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -3076,6 +3076,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, > amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, > DRM_AUTH|DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, > amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, > amdgpu_user_options_ioctl, > +DRM_AUTH|DRM_RENDER_ALLOW), > }; > > static const struct drm_driver amdgpu_kms_driver = { diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index 24526e92f9b8..772e0fda7e14 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -1423,6 +1423,33 @@ int amdgpu_info_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > return 0; > } > > +/** > + * amdgpu_user_options_ioctl - set per-fd user options > + * > + * @dev: drm dev pointer > + * @data: pointer to struct drm_amdgpu_user_options > + * @filp: drm file > + * > + * Sets options stored on the per-file amdgpu_fpriv. Currently the only > + * supported option is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > which > + * controls how KFD delivers SIGBUS for poison/RAS events to the > +calling > + * process (immediate, suppressed, or delayed by N milliseconds). 
> + */ > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, > + struct drm_file *filp) > +{ > + struct drm_amdgpu_user_options *args = data; > + > + switch (args->op) { > + case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY: > + return amdgpu_amdkfd_set_sigbus_delay(current, > + > args->kfd_sigbus_delay.value); > + default: > + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op); > + return -EINVAL; > + } > +} > + > /** > * amdgpu_driver_open_kms - drm callback for open > * > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > index e9be798c0a2b..e7d70e3a7f3e 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > @@ -29,10 +29,12 @@ > #include <linux/uaccess.h> > #include <linux/mman.h> > #include <linux/memory.h> > +#include <linux/workqueue.h> > #include "kfd_priv.h" > #include "kfd_events.h" > #include "kfd_device_queue_manager.h" > #include <linux/device.h> > +#include <uapi/drm/amdgpu_drm.h> > > /* > * Wrapper around wait_queue_entry_t > @@ -1337,6 +1339,72 @@ void kfd_signal_reset_event(struct kfd_node *dev) > srcu_read_unlock(&kfd_processes_srcu, idx); } > > +/* > + * Per-process opt-in for poison-consumption SIGBUS handling. > + * > + * Default: kernel sends SIGBUS to the process immediately when poison > +is > + * consumed, in addition to delivering the KFD HW/MEMORY exception events. > + * > + * Userspace (ROCr) can opt-in per-process via the > + * DRM_IOCTL_AMDGPU_USER_OPTIONS / > +AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > + * option. This lets the app's registered system-event callback handle > +the > + * RAS error first, instead of being killed by SIGBUS. 
> + * > + * Encoded value (stored on the kfd_process): > + * 0 - default: SIGBUS immediately (no opt-in) > + * 0xFFFFFFFF - opt-in, never escalate to SIGBUS > + * N (other) - opt-in, escalate to SIGBUS after N ms if app does not > + * handle the error in time (safety timeout) > + */ > + > +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) { > + struct kfd_process_device *pdd = container_of(to_delayed_work(work), > + struct kfd_process_device, work); > + struct kfd_process *p = pdd->process; > + > + if (p->lead_thread) > + send_sig(SIGBUS, p->lead_thread, 0); > + > + kfd_unref_process(p); > +} > + > +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev, > + struct kfd_process *p) > +{ > + struct kfd_process_device *pdd; > + u32 delay_ms = atomic_read(&p->kfd_sigbus_delay_ms); > + > + if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) > { > + dev_info(dev->adev->dev, > + "SIGBUS suppressed for process %s(pid:%d): app opted > in to handle > RAS error\n", > + p->lead_thread->comm, p->lead_thread->pid); > + return; > + } > + > + if (delay_ms == 0) > + goto send_now; > + > + pdd = kfd_get_process_device_data(dev, p); > + if (!pdd) { > + dev_err(dev->adev->dev, "Process device data doesn't > exist\n"); > + goto send_now; > + } > + > + /* Take an extra reference for the delayed worker. 
*/ > + kref_get(&p->ref); > + INIT_DELAYED_WORK(&pdd->work, kfd_signal_sigbus_delayed_fn); > + > + dev_info(dev->adev->dev, > + "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error > opt-in > safety timeout)\n", > + p->lead_thread->comm, p->lead_thread->pid, delay_ms); > + schedule_delayed_work(&pdd->work, msecs_to_jiffies(delay_ms)); > + return; > + > +send_now: > + send_sig(SIGBUS, p->lead_thread, 0); > +} > + > void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) { > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); @@ - > 1391,7 +1459,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node > *dev, u32 pasid) > rcu_read_unlock(); > > /* user application will handle SIGBUS signal */ > - send_sig(SIGBUS, p->lead_thread, 0); > + kfd_signal_sigbus_with_delay(dev, p); > > kfd_unref_process(p); > } > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index f037062c33ea..d3fcf07c0ebe 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -875,6 +875,9 @@ struct kfd_process_device { > u32 pasid; > /* Indicates this process has requested PTL stay disabled */ > bool ptl_disable_req; > + > + /* Delayed signal to user */ > + struct delayed_work work; > }; > > #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ - > 957,6 +960,17 @@ struct kfd_process { > size_t signal_event_count; > bool signal_event_limit_reached; > > + /** > + * @kfd_sigbus_delay_ms: Per-process KFD SIGBUS delivery option for > + * poison/RAS events (set via DRM_IOCTL_AMDGPU_USER_OPTIONS / > + * AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY). 
> + * > + * 0 - send SIGBUS immediately (default) > + * 0xFFFFFFFF - suppress SIGBUS delivery > + * other - delay SIGBUS delivery by this many milliseconds > + */ > + atomic_t kfd_sigbus_delay_ms; > + > /* Information used for memory eviction */ > void *kgd_process_info; > /* Eviction fence that is attached to all the BOs of this process. > The diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 419bb8086ccd..dadb7cf7b072 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -992,6 +992,29 @@ struct kfd_process *kfd_create_process(struct > task_struct *thread) > return process; > } > > +/** > + * amdgpu_amdkfd_set_sigbus_delay - Set per-process KFD SIGBUS delay > + * @task: task in the target process > + * @ms: encoded delay value (0 = immediate, 0xFFFFFFFF = suppress, > + * otherwise delay in milliseconds) > + * > + * Stores the SIGBUS delivery option on the kfd_process associated with > + * @task. If no kfd_process exists yet, one is created so the option > + * persists until poison/RAS events are signaled. 
> + */ > +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms) { > + struct kfd_process *p; > + > + p = kfd_create_process(task); > + if (IS_ERR(p)) > + return PTR_ERR(p); > + > + atomic_set(&p->kfd_sigbus_delay_ms, ms); > + kfd_unref_process(p); > + return 0; > +} > + > static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) { > struct kfd_process *process; > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > index 9f3090db2f16..ab71c4b4aeac 100644 > --- a/include/uapi/drm/amdgpu_drm.h > +++ b/include/uapi/drm/amdgpu_drm.h > @@ -58,6 +58,7 @@ extern "C" { > #define DRM_AMDGPU_USERQ_SIGNAL 0x17 > #define DRM_AMDGPU_USERQ_WAIT 0x18 > #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 > +#define DRM_AMDGPU_USER_OPTIONS 0x1A > > #define DRM_IOCTL_AMDGPU_GEM_CREATE > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union > drm_amdgpu_gem_create) > #define DRM_IOCTL_AMDGPU_GEM_MMAP > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union > drm_amdgpu_gem_mmap) > @@ -79,6 +80,7 @@ extern "C" { > #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct > drm_amdgpu_userq_signal) > #define DRM_IOCTL_AMDGPU_USERQ_WAIT > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct > drm_amdgpu_userq_wait) > #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, > struct drm_amdgpu_gem_list_handles) > +#define DRM_IOCTL_AMDGPU_USER_OPTIONS > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USER_OPTIONS, struct > drm_amdgpu_user_options) > > /** > * DOC: memory domains > @@ -1673,6 +1675,25 @@ struct drm_amdgpu_info_uq_metadata { > #define AMDGPU_FAMILY_GC_11_5_4 154 /* GC 11.5.4 */ > #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ > > +/* > + * Definition of user options > + * > + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > + * 0: Disable sigbus delay - SIGBUS will be raised immediately > + * 0xFFFFFFFF: SIGBUS will not be raised > + * 
other: Set the sigbus delay in milliseconds > + */ > +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY 0 > + > +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED > 0xFFFFFFFFu > + > +struct drm_amdgpu_user_options { > + __u32 op; > + struct { > + __u32 value; > + } kfd_sigbus_delay; > +}; > + > #if defined(__cplusplus) > } > #endif > -- > 2.43.0
