[AMD General] I think the kfd_sigbus_delay value should be uint32_t rather than uint16_t.
uint16_t is only ~1 minute. Once userspace gets the first exception event, it may start generating a coredump file and the coredump file generation can take > 40 minutes. ~David > -----Original Message----- > From: Alex Deucher <[email protected]> > Sent: Tuesday, May 19, 2026 9:55 AM > To: Zhang, Yifan <[email protected]> > Cc: [email protected]; Deucher, Alexander > <[email protected]>; Koenig, Christian <[email protected]>; > Kuehling, Felix <[email protected]>; Yat Sin, David > <[email protected]>; Russell, Kent <[email protected]>; Yuan, Perry > <[email protected]> > Subject: Re: [PATCH v3] drm/amdgpu: add ioctl to handle RAS poison error > > On Sun, May 17, 2026 at 1:44 AM Yifan Zhang <[email protected]> wrote: > > > > Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the > > AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing > userspace > > (ROCr) to control per-process SIGBUS delivery. > > > > Userspace for this can be found at: > > https://github.com/ROCm/rocm-systems/pull/6190 > > > > Signed-off-by: Yifan Zhang <[email protected]> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 12 +++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + > > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 29 ++++++ > > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 118 +++++++++++++++++++++++- > > include/uapi/drm/amdgpu_drm.h | 24 +++++ > > 5 files changed, 182 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > index 5d7bfa59424a..0408476f1070 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > @@ -455,6 +455,16 @@ struct amdgpu_fpriv { > > > > /** GPU partition selection */ > > uint32_t xcp_id; > > + > > + /** > > + * @kfd_sigbus_delay_ms: Per-fd KFD SIGBUS delivery option (set via > > + * DRM_IOCTL_AMDGPU_USER_OPTIONS / > AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY). 
> > + * > > + * 0 - send SIGBUS immediately (default) > > + * 0xFFFF - suppress SIGBUS delivery > > + * other - delay SIGBUS delivery by this many milliseconds > > + */ > > + atomic_t kfd_sigbus_delay_ms; > > }; > > > > int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv > > **fpriv); @@ -1467,6 +1477,8 @@ int amdgpu_enable_vblank_kms(struct > > drm_crtc *crtc); void amdgpu_disable_vblank_kms(struct drm_crtc > > *crtc); int amdgpu_info_ioctl(struct drm_device *dev, void *data, > > struct drm_file *filp); > > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, > > + struct drm_file *filp); > > > > /* > > * functions used by amdgpu_encoder.c diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > index 99688391e70b..cad18bd6f8b3 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > @@ -3078,6 +3078,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { > > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, > amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, > amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, > > amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > + DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, > > + amdgpu_user_options_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > }; > > > > static const struct drm_driver amdgpu_kms_driver = { diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > index 24526e92f9b8..7903587b8bbb 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > @@ -1423,6 +1423,35 @@ int amdgpu_info_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > > return 0; > > } > > > > +/** > > + * amdgpu_user_options_ioctl - set per-fd user options > > + * > > + * @dev: drm dev pointer > > + * @data: pointer to 
struct drm_amdgpu_user_options > > + * @filp: drm file > > + * > > + * Sets options stored on the per-file amdgpu_fpriv. Currently the > > +only > > + * supported option is > %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY which > > + * controls how KFD delivers SIGBUS for poison/RAS events to the > > +calling > > + * process (immediate, suppressed, or delayed by N milliseconds). > > + */ > > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, > > + struct drm_file *filp) { > > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > > + struct drm_amdgpu_user_options *args = data; > > + > > + switch (args->op) { > > + case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY: > > + atomic_set(&fpriv->kfd_sigbus_delay_ms, > > + args->kfd_sigbus_delay.value); > > + return 0; > > + default: > > + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op); > > + return -EINVAL; > > + } > > +} > > + > > /** > > * amdgpu_driver_open_kms - drm callback for open > > * > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > index e9be798c0a2b..200570401f51 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > @@ -29,10 +29,12 @@ > > #include <linux/uaccess.h> > > #include <linux/mman.h> > > #include <linux/memory.h> > > +#include <linux/workqueue.h> > > #include "kfd_priv.h" > > #include "kfd_events.h" > > #include "kfd_device_queue_manager.h" > > #include <linux/device.h> > > +#include <uapi/drm/amdgpu_drm.h> > > > > /* > > * Wrapper around wait_queue_entry_t > > @@ -1337,6 +1339,119 @@ void kfd_signal_reset_event(struct kfd_node *dev) > > srcu_read_unlock(&kfd_processes_srcu, idx); } > > > > +/* > > + * Per-process opt-in for poison-consumption SIGBUS handling. > > + * > > + * Default: kernel sends SIGBUS to the process immediately when > > +poison is > > + * consumed, in addition to delivering the KFD HW/MEMORY exception events. 
> > + * > > + * Userspace (ROCr) can opt-in per-process via the > > + * DRM_IOCTL_AMDGPU_USER_OPTIONS / > > +AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > > + * option. This lets the app's registered system-event callback > > +handle the > > + * RAS error first, instead of being killed by SIGBUS. > > + * > > + * Encoded value (set on any of the process' amdgpu render fds): > > + * 0 - default: SIGBUS immediately (no opt-in) > > + * 0xFFFF - opt-in, never escalate to SIGBUS > > + * N (other) - opt-in, escalate to SIGBUS after N ms if app does not > > + * handle the error in time (safety timeout) > > + * > > + * Per-process scope: the option is honored if ANY of the process' > > +amdgpu > > + * fds has been configured. This matches the slide deck's > > +"Per-process, > > + * App set at init" semantics, while keeping the UAPI on amdgpu where > > +ROCr > > + * sets it. > > + */ > > +struct kfd_sigbus_delayed_work { > > + struct delayed_work work; > > + struct kfd_process *p; > > +}; > > + > > +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) { > > + struct kfd_sigbus_delayed_work *dw = > > container_of(to_delayed_work(work), > > + struct kfd_sigbus_delayed_work, work); > > + struct kfd_process *p = dw->p; > > + > > + if (p->lead_thread) > > + send_sig(SIGBUS, p->lead_thread, 0); > > + > > + kfd_unref_process(p); > > + kfree(dw); > > +} > > + > > +/* > > + * Resolve the per-process SIGBUS opt-in setting by scanning all of > > +the > > + * process' KFD pdds (each backed by an amdgpu render fd). Returns > > +the > > + * "most lenient" value across all fds, in this priority: > > + * DISABLED (no SIGBUS) > any non-zero timeout > 0 (immediate) > > + * > > + * Rationale: if the app has explicitly opted in on any GPU it uses, > > +it > > + * wants the chance to handle the error in userspace. 
> > + */ > > +static u16 kfd_get_sigbus_delay_ms(struct kfd_process *p) { > > + u16 result = 0; > > + int i; > > + > > + mutex_lock(&p->mutex); > > + for (i = 0; i < p->n_pdds; i++) { > > + struct kfd_process_device *pdd = p->pdds[i]; > > + struct amdgpu_fpriv *drv_priv; > > + u16 v; > > + > > + if (!pdd || !pdd->drm_file) > > + continue; > > + if (amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv)) > > + continue; > > + > > + v = atomic_read(&drv_priv->kfd_sigbus_delay_ms); > > + if (v == > AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) { > > + result = v; > > + break; > > + } > > + if (v > result) > > + result = v; > > + } > > + mutex_unlock(&p->mutex); > > + > > + return result; > > +} > > + > > +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev, > > + struct kfd_process *p) { > > + u16 delay_ms = kfd_get_sigbus_delay_ms(p); > > + struct kfd_sigbus_delayed_work *dw; > > + > > + if (delay_ms == > AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) { > > + dev_info(dev->adev->dev, > > + "SIGBUS suppressed for process %s(pid:%d): app > > opted in to > handle RAS error\n", > > + p->lead_thread->comm, p->lead_thread->pid); > > + return; > > + } > > + > > + if (delay_ms == 0) > > + goto send_now; > > + > > + dw = kzalloc(sizeof(*dw), GFP_ATOMIC); > > + if (!dw) > > + goto send_now; > > + > > + /* Take an extra reference for the delayed worker. */ > > + kref_get(&p->ref); > > + dw->p = p; > > + INIT_DELAYED_WORK(&dw->work, kfd_signal_sigbus_delayed_fn); > > + > > + dev_info(dev->adev->dev, > > + "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS > > error opt- > in safety timeout)\n", > > + p->lead_thread->comm, p->lead_thread->pid, delay_ms); > > + schedule_delayed_work(&dw->work, msecs_to_jiffies(delay_ms)); > > + return; > > + > > +send_now: > > + send_sig(SIGBUS, p->lead_thread, 0); > > > Probably worth adding a comment here that this feature is not supported with > confidential compute. Other than that, looks good to me. 
> > Alex > > > +} > > + > > void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 > > pasid) { > > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, > > NULL); @@ -1345,7 +1460,6 @@ void > kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) > > struct kfd_event *ev; > > uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; > > int user_gpu_id; > > - > > if (!p) { > > dev_warn(dev->adev->dev, "Not find process with > > pasid:%d\n", pasid); > > return; /* Presumably process exited. */ @@ -1391,7 > > +1505,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 > pasid) > > rcu_read_unlock(); > > > > /* user application will handle SIGBUS signal */ > > - send_sig(SIGBUS, p->lead_thread, 0); > > + kfd_signal_sigbus_with_delay(dev, p); > > > > kfd_unref_process(p); > > } > > diff --git a/include/uapi/drm/amdgpu_drm.h > > b/include/uapi/drm/amdgpu_drm.h index 9f3090db2f16..e88d7cf53858 > > 100644 > > --- a/include/uapi/drm/amdgpu_drm.h > > +++ b/include/uapi/drm/amdgpu_drm.h > > @@ -58,6 +58,7 @@ extern "C" { > > #define DRM_AMDGPU_USERQ_SIGNAL 0x17 > > #define DRM_AMDGPU_USERQ_WAIT 0x18 > > #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 > > +#define DRM_AMDGPU_USER_OPTIONS 0x1A > > > > #define DRM_IOCTL_AMDGPU_GEM_CREATE > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union > drm_amdgpu_gem_create) > > #define DRM_IOCTL_AMDGPU_GEM_MMAP > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union > drm_amdgpu_gem_mmap) > > @@ -79,6 +80,7 @@ extern "C" { > > #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct > drm_amdgpu_userq_signal) > > #define DRM_IOCTL_AMDGPU_USERQ_WAIT > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct > drm_amdgpu_userq_wait) > > #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES > DRM_IOWR(DRM_COMMAND_BASE + > > DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) > > +#define DRM_IOCTL_AMDGPU_USER_OPTIONS > DRM_IOWR(DRM_COMMAND_BASE + > > 
+DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options) > > > > /** > > * DOC: memory domains > > @@ -1673,6 +1675,28 @@ struct drm_amdgpu_info_uq_metadata { > > #define AMDGPU_FAMILY_GC_11_5_4 154 /* GC 11.5.4 */ > > #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ > > > > +/* > > + * Definition of user options > > + * > > + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > > + * 0: Disable sigbus delay - SIGBUS will be raised immediately > > + * 0xFFFF: SIGBUS will not be raised > > + * other: Set the sigbus delay in milliseconds > > + */ > > +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY 0 > > + > > +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED > 0xFFFFu > > + > > +struct drm_amdgpu_user_options { > > + __u32 op; > > + union { > > + struct { > > + __u16 value; > > + __u16 _pad; > > + } kfd_sigbus_delay; > > + }; > > +}; > > + > > #if defined(__cplusplus) > > } > > #endif > > -- > > 2.43.0 > >
