On Tue, May 19, 2026 at 11:16 AM Yat Sin, David <[email protected]> wrote:
>
> AMD General
>
> I think the kfd_sigbus_delay should be uint32_t.
>
> uint16_t is only ~1 minute. Once userspace gets the first exception event, it
> may start generating a coredump file and the coredump file generation can
> take > 40 minutes.
>
Ah, ok. That makes sense. Alex > ~David > > > > -----Original Message----- > > From: Alex Deucher <[email protected]> > > Sent: Tuesday, May 19, 2026 9:55 AM > > To: Zhang, Yifan <[email protected]> > > Cc: [email protected]; Deucher, Alexander > > <[email protected]>; Koenig, Christian <[email protected]>; > > Kuehling, Felix <[email protected]>; Yat Sin, David > > <[email protected]>; Russell, Kent <[email protected]>; Yuan, Perry > > <[email protected]> > > Subject: Re: [PATCH v3] drm/amdgpu: add ioctl to handle RAS poison error > > > > On Sun, May 17, 2026 at 1:44 AM Yifan Zhang <[email protected]> wrote: > > > > > > Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the > > > AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing > > userspace > > > (ROCr) to control per-process SIGBUS delivery. > > > > > > Userspace for this can be found at: > > > https://github.com/ROCm/rocm-systems/pull/6190 > > > > > > Signed-off-by: Yifan Zhang <[email protected]> > > > --- > > > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 12 +++ > > > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + > > > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 29 ++++++ > > > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 118 +++++++++++++++++++++++- > > > include/uapi/drm/amdgpu_drm.h | 24 +++++ > > > 5 files changed, 182 insertions(+), 2 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > > index 5d7bfa59424a..0408476f1070 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > > @@ -455,6 +455,16 @@ struct amdgpu_fpriv { > > > > > > /** GPU partition selection */ > > > uint32_t xcp_id; > > > + > > > + /** > > > + * @kfd_sigbus_delay_ms: Per-fd KFD SIGBUS delivery option (set > > > via > > > + * DRM_IOCTL_AMDGPU_USER_OPTIONS / > > AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY). 
> > > + * > > > + * 0 - send SIGBUS immediately (default) > > > + * 0xFFFF - suppress SIGBUS delivery > > > + * other - delay SIGBUS delivery by this many milliseconds > > > + */ > > > + atomic_t kfd_sigbus_delay_ms; > > > }; > > > > > > int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv > > > **fpriv); @@ -1467,6 +1477,8 @@ int amdgpu_enable_vblank_kms(struct > > > drm_crtc *crtc); void amdgpu_disable_vblank_kms(struct drm_crtc > > > *crtc); int amdgpu_info_ioctl(struct drm_device *dev, void *data, > > > struct drm_file *filp); > > > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, > > > + struct drm_file *filp); > > > > > > /* > > > * functions used by amdgpu_encoder.c diff --git > > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > > index 99688391e70b..cad18bd6f8b3 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > > @@ -3078,6 +3078,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { > > > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, > > amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, > > amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > > DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, > > > amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > > + DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, > > > + amdgpu_user_options_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), > > > }; > > > > > > static const struct drm_driver amdgpu_kms_driver = { diff --git > > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > > index 24526e92f9b8..7903587b8bbb 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > > > @@ -1423,6 +1423,35 @@ int amdgpu_info_ioctl(struct drm_device *dev, void > > *data, struct drm_file *filp) > > > return 0; > > > } > > > > > > +/** > > > + * amdgpu_user_options_ioctl - 
set per-fd user options > > > + * > > > + * @dev: drm dev pointer > > > + * @data: pointer to struct drm_amdgpu_user_options > > > + * @filp: drm file > > > + * > > > + * Sets options stored on the per-file amdgpu_fpriv. Currently the > > > +only > > > + * supported option is > > %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY which > > > + * controls how KFD delivers SIGBUS for poison/RAS events to the > > > +calling > > > + * process (immediate, suppressed, or delayed by N milliseconds). > > > + */ > > > +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, > > > + struct drm_file *filp) { > > > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > > > + struct drm_amdgpu_user_options *args = data; > > > + > > > + switch (args->op) { > > > + case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY: > > > + atomic_set(&fpriv->kfd_sigbus_delay_ms, > > > + args->kfd_sigbus_delay.value); > > > + return 0; > > > + default: > > > + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op); > > > + return -EINVAL; > > > + } > > > +} > > > + > > > /** > > > * amdgpu_driver_open_kms - drm callback for open > > > * > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > > b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > > index e9be798c0a2b..200570401f51 100644 > > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > > @@ -29,10 +29,12 @@ > > > #include <linux/uaccess.h> > > > #include <linux/mman.h> > > > #include <linux/memory.h> > > > +#include <linux/workqueue.h> > > > #include "kfd_priv.h" > > > #include "kfd_events.h" > > > #include "kfd_device_queue_manager.h" > > > #include <linux/device.h> > > > +#include <uapi/drm/amdgpu_drm.h> > > > > > > /* > > > * Wrapper around wait_queue_entry_t > > > @@ -1337,6 +1339,119 @@ void kfd_signal_reset_event(struct kfd_node *dev) > > > srcu_read_unlock(&kfd_processes_srcu, idx); } > > > > > > +/* > > > + * Per-process opt-in for poison-consumption SIGBUS handling. 
> > > + * > > > + * Default: kernel sends SIGBUS to the process immediately when > > > +poison is > > > + * consumed, in addition to delivering the KFD HW/MEMORY exception > > > events. > > > + * > > > + * Userspace (ROCr) can opt-in per-process via the > > > + * DRM_IOCTL_AMDGPU_USER_OPTIONS / > > > +AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > > > + * option. This lets the app's registered system-event callback > > > +handle the > > > + * RAS error first, instead of being killed by SIGBUS. > > > + * > > > + * Encoded value (set on any of the process' amdgpu render fds): > > > + * 0 - default: SIGBUS immediately (no opt-in) > > > + * 0xFFFF - opt-in, never escalate to SIGBUS > > > + * N (other) - opt-in, escalate to SIGBUS after N ms if app does not > > > + * handle the error in time (safety timeout) > > > + * > > > + * Per-process scope: the option is honored if ANY of the process' > > > +amdgpu > > > + * fds has been configured. This matches the slide deck's > > > +"Per-process, > > > + * App set at init" semantics, while keeping the UAPI on amdgpu where > > > +ROCr > > > + * sets it. > > > + */ > > > +struct kfd_sigbus_delayed_work { > > > + struct delayed_work work; > > > + struct kfd_process *p; > > > +}; > > > + > > > +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) { > > > + struct kfd_sigbus_delayed_work *dw = > > > container_of(to_delayed_work(work), > > > + struct kfd_sigbus_delayed_work, work); > > > + struct kfd_process *p = dw->p; > > > + > > > + if (p->lead_thread) > > > + send_sig(SIGBUS, p->lead_thread, 0); > > > + > > > + kfd_unref_process(p); > > > + kfree(dw); > > > +} > > > + > > > +/* > > > + * Resolve the per-process SIGBUS opt-in setting by scanning all of > > > +the > > > + * process' KFD pdds (each backed by an amdgpu render fd). 
Returns > > > +the > > > + * "most lenient" value across all fds, in this priority: > > > + * DISABLED (no SIGBUS) > any non-zero timeout > 0 (immediate) > > > + * > > > + * Rationale: if the app has explicitly opted in on any GPU it uses, > > > +it > > > + * wants the chance to handle the error in userspace. > > > + */ > > > +static u16 kfd_get_sigbus_delay_ms(struct kfd_process *p) { > > > + u16 result = 0; > > > + int i; > > > + > > > + mutex_lock(&p->mutex); > > > + for (i = 0; i < p->n_pdds; i++) { > > > + struct kfd_process_device *pdd = p->pdds[i]; > > > + struct amdgpu_fpriv *drv_priv; > > > + u16 v; > > > + > > > + if (!pdd || !pdd->drm_file) > > > + continue; > > > + if (amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv)) > > > + continue; > > > + > > > + v = atomic_read(&drv_priv->kfd_sigbus_delay_ms); > > > + if (v == > > AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) { > > > + result = v; > > > + break; > > > + } > > > + if (v > result) > > > + result = v; > > > + } > > > + mutex_unlock(&p->mutex); > > > + > > > + return result; > > > +} > > > + > > > +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev, > > > + struct kfd_process *p) { > > > + u16 delay_ms = kfd_get_sigbus_delay_ms(p); > > > + struct kfd_sigbus_delayed_work *dw; > > > + > > > + if (delay_ms == > > AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) { > > > + dev_info(dev->adev->dev, > > > + "SIGBUS suppressed for process %s(pid:%d): app > > > opted in to > > handle RAS error\n", > > > + p->lead_thread->comm, p->lead_thread->pid); > > > + return; > > > + } > > > + > > > + if (delay_ms == 0) > > > + goto send_now; > > > + > > > + dw = kzalloc(sizeof(*dw), GFP_ATOMIC); > > > + if (!dw) > > > + goto send_now; > > > + > > > + /* Take an extra reference for the delayed worker. 
*/ > > > + kref_get(&p->ref); > > > + dw->p = p; > > > + INIT_DELAYED_WORK(&dw->work, kfd_signal_sigbus_delayed_fn); > > > + > > > + dev_info(dev->adev->dev, > > > + "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS > > > error opt- > > in safety timeout)\n", > > > + p->lead_thread->comm, p->lead_thread->pid, delay_ms); > > > + schedule_delayed_work(&dw->work, msecs_to_jiffies(delay_ms)); > > > + return; > > > + > > > +send_now: > > > + send_sig(SIGBUS, p->lead_thread, 0); > > > > > > Probably worth adding a comment here that this feature is not supported with > > confidential compute. Other than that, looks good to me. > > > > Alex > > > > > +} > > > + > > > void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 > > > pasid) { > > > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, > > > NULL); @@ -1345,7 +1460,6 @@ void > > kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) > > > struct kfd_event *ev; > > > uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; > > > int user_gpu_id; > > > - > > > if (!p) { > > > dev_warn(dev->adev->dev, "Not find process with > > > pasid:%d\n", pasid); > > > return; /* Presumably process exited. 
*/ @@ -1391,7 > > > +1505,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 > > pasid) > > > rcu_read_unlock(); > > > > > > /* user application will handle SIGBUS signal */ > > > - send_sig(SIGBUS, p->lead_thread, 0); > > > + kfd_signal_sigbus_with_delay(dev, p); > > > > > > kfd_unref_process(p); > > > } > > > diff --git a/include/uapi/drm/amdgpu_drm.h > > > b/include/uapi/drm/amdgpu_drm.h index 9f3090db2f16..e88d7cf53858 > > > 100644 > > > --- a/include/uapi/drm/amdgpu_drm.h > > > +++ b/include/uapi/drm/amdgpu_drm.h > > > @@ -58,6 +58,7 @@ extern "C" { > > > #define DRM_AMDGPU_USERQ_SIGNAL 0x17 > > > #define DRM_AMDGPU_USERQ_WAIT 0x18 > > > #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 > > > +#define DRM_AMDGPU_USER_OPTIONS 0x1A > > > > > > #define DRM_IOCTL_AMDGPU_GEM_CREATE > > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union > > drm_amdgpu_gem_create) > > > #define DRM_IOCTL_AMDGPU_GEM_MMAP > > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union > > drm_amdgpu_gem_mmap) > > > @@ -79,6 +80,7 @@ extern "C" { > > > #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL > > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct > > drm_amdgpu_userq_signal) > > > #define DRM_IOCTL_AMDGPU_USERQ_WAIT > > DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct > > drm_amdgpu_userq_wait) > > > #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES > > DRM_IOWR(DRM_COMMAND_BASE + > > > DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) > > > +#define DRM_IOCTL_AMDGPU_USER_OPTIONS > > DRM_IOWR(DRM_COMMAND_BASE + > > > +DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options) > > > > > > /** > > > * DOC: memory domains > > > @@ -1673,6 +1675,28 @@ struct drm_amdgpu_info_uq_metadata { > > > #define AMDGPU_FAMILY_GC_11_5_4 154 /* GC 11.5.4 > > > */ > > > #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 > > > */ > > > > > > +/* > > > + * Definition of user options > > > + * > > > + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY > > > + * 0: Disable sigbus 
delay - SIGBUS will be raised > > > immediately > > > + * 0xFFFF: SIGBUS will not be raised > > > + * other: Set the sigbus delay in milliseconds > > > + */ > > > +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY 0 > > > + > > > +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED > > 0xFFFFu > > > + > > > +struct drm_amdgpu_user_options { > > > + __u32 op; > > > + union { > > > + struct { > > > + __u16 value; > > > + __u16 _pad; > > > + } kfd_sigbus_delay; > > > + }; > > > +}; > > > + > > > #if defined(__cplusplus) > > > } > > > #endif > > > -- > > > 2.43.0 > > >
