-----Original Message-----
From: amd-gfx <[email protected]> On Behalf Of Zhang,
Yifan
Sent: May 24, 2026 7:18 AM
To: [email protected]
Cc: Kuehling, Felix <[email protected]>; Deucher, Alexander
<[email protected]>; Koenig, Christian
<[email protected]>; Yat Sin, David <[email protected]>; Lazar,
Lijo <[email protected]>
Subject: RE: [PATCH v4] drm/amdgpu: add ioctl to handle RAS poison error
AMD General
ping
-----Original Message-----
From: Zhang, Yifan <[email protected]>
Sent: Thursday, May 21, 2026 5:05 PM
To: [email protected]
Cc: Kuehling, Felix <[email protected]>; Deucher, Alexander
<[email protected]>; Koenig, Christian
<[email protected]>; Yat Sin, David <[email protected]>; Lazar,
Lijo <[email protected]>; Zhang, Yifan <[email protected]>
Subject: [PATCH v4] drm/amdgpu: add ioctl to handle RAS poison error
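Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the
AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace
(ROCr) to control per-process SIGBUS delivery when RAS poison is
consumed: immediately (value 0, the default), never (0xFFFFFFFF), or
after a safety timeout of N ms if the application has not handled the
error in time.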
Userspace for this can be found at:
https://github.com/ROCm/rocm-systems/pull/6190
Signed-off-by: Yifan Zhang <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 6 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 27 +++++++++
drivers/gpu/drm/amd/amdkfd/kfd_events.c | 70 +++++++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++++
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 23 +++++++
include/uapi/drm/amdgpu_drm.h | 21 +++++++
8 files changed, 163 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5d7bfa59424a..771ec0608270 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1467,6 +1467,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc
*crtc); void amdgpu_disable_vblank_kms(struct drm_crtc *crtc); int
amdgpu_info_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp);
+int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *filp);
/*
* functions used by amdgpu_encoder.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 5333e052d56d..68d83a6e6b3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -210,6 +210,7 @@ int amdgpu_amdkfd_evict_userptr(struct
mmu_interval_notifier *mni, int amdgpu_amdkfd_bo_validate_and_fence(struct
amdgpu_bo *bo,
uint32_t domain,
struct dma_fence *fence);
+int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms);
#else
static inline
bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
@@ -241,6 +242,11 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct
amdgpu_bo *bo, {
return 0;
}
+static inline
+int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms)
+{
+ return -EOPNOTSUPP; /* stub in the #else branch: the option is reported as unsupported */
+}
#endif
/* Shared API */
int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1781c0c3d010..4d4d21babc61 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -3076,6 +3076,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL,
amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl,
DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES,
amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS,
amdgpu_user_options_ioctl,
+DRM_AUTH|DRM_RENDER_ALLOW),
};
static const struct drm_driver amdgpu_kms_driver = { diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 24526e92f9b8..772e0fda7e14 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1423,6 +1423,33 @@ int amdgpu_info_ioctl(struct drm_device *dev, void
*data, struct drm_file *filp)
return 0;
}
+/**
+ * amdgpu_user_options_ioctl - set per-process user options
+ *
+ * @dev: drm dev pointer
+ * @data: pointer to struct drm_amdgpu_user_options
+ * @filp: drm file (not used; the option is applied to the calling task)
+ *
+ * Dispatches on the op field. The only supported op is
+ * %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY, which controls how KFD delivers
+ * SIGBUS for poison/RAS events to the calling process (immediate, suppressed,
+ * or delayed by N milliseconds). Unknown ops return -EINVAL.
+ */
+int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *filp)
+{
+ struct drm_amdgpu_user_options *args = data;
+
+ switch (args->op) {
+ case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY:
+ return amdgpu_amdkfd_set_sigbus_delay(current,
+ args->kfd_sigbus_delay.value);
+ default:
+ DRM_DEBUG_KMS("Invalid user option op %u\n", args->op);
+ return -EINVAL;
+ }
+}
+
/**
* amdgpu_driver_open_kms - drm callback for open
*
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9be798c0a2b..e7d70e3a7f3e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -29,10 +29,12 @@
#include <linux/uaccess.h>
#include <linux/mman.h>
#include <linux/memory.h>
+#include <linux/workqueue.h>
#include "kfd_priv.h"
#include "kfd_events.h"
#include "kfd_device_queue_manager.h"
#include <linux/device.h>
+#include <uapi/drm/amdgpu_drm.h>
/*
* Wrapper around wait_queue_entry_t
@@ -1337,6 +1339,72 @@ void kfd_signal_reset_event(struct kfd_node *dev)
srcu_read_unlock(&kfd_processes_srcu, idx); }
+/*
+ * Per-process opt-in for poison-consumption SIGBUS handling.
+ *
+ * Default: kernel sends SIGBUS to the process immediately when poison is
+ * consumed, in addition to delivering the KFD HW/MEMORY exception events.
+ *
+ * Userspace (ROCr) can opt in per process via the
+ * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
+ * option. This lets the app's registered system-event callback handle the
+ * RAS error first, instead of being killed by SIGBUS.
+ *
+ * Encoded value (stored on the kfd_process):
+ *   0          - default: SIGBUS immediately (no opt-in)
+ *   0xFFFFFFFF - opt-in, never escalate to SIGBUS
+ *   N (other)  - opt-in, escalate to SIGBUS after N ms if app does not
+ *                handle the error in time (safety timeout)
+ */
+
+static void kfd_signal_sigbus_delayed_fn(struct work_struct *work)
+{
+ struct kfd_process_device *pdd = container_of(to_delayed_work(work),
+ struct kfd_process_device, work);
+ struct kfd_process *p = pdd->process;
+
+ if (p->lead_thread) /* delayed work fired: deliver the deferred SIGBUS */
+ send_sig(SIGBUS, p->lead_thread, 0);
+
+ kfd_unref_process(p); /* drop the reference taken when the work was queued */
+}
+
+static void kfd_signal_sigbus_with_delay(struct kfd_node *dev,
+ struct kfd_process *p)
+{
+ struct kfd_process_device *pdd;
+ u32 delay_ms = atomic_read(&p->kfd_sigbus_delay_ms);
+
+ if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED)
{
+ dev_info(dev->adev->dev,
+ "SIGBUS suppressed for process %s(pid:%d): app opted
in to handle
RAS error\n",
+ p->lead_thread->comm, p->lead_thread->pid);
+ return;
+ }
+
+ if (delay_ms == 0)
+ goto send_now;
+
+ pdd = kfd_get_process_device_data(dev, p);
+ if (!pdd) {
+ dev_err(dev->adev->dev, "Process device data doesn't exist\n");
+ goto send_now;
+ }
+
+ /* Take an extra reference for the delayed worker. */
+ kref_get(&p->ref);
+ INIT_DELAYED_WORK(&pdd->work, kfd_signal_sigbus_delayed_fn);
+
+ dev_info(dev->adev->dev,
+ "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error
opt-in
safety timeout)\n",
+ p->lead_thread->comm, p->lead_thread->pid, delay_ms);
+ schedule_delayed_work(&pdd->work, msecs_to_jiffies(delay_ms));