AMD General

ping

-----Original Message-----
From: Zhang, Yifan <[email protected]>
Sent: Thursday, May 21, 2026 5:05 PM
To: [email protected]
Cc: Kuehling, Felix <[email protected]>; Deucher, Alexander 
<[email protected]>; Koenig, Christian <[email protected]>; Yat 
Sin, David <[email protected]>; Lazar, Lijo <[email protected]>; Zhang, 
Yifan <[email protected]>
Subject: [PATCH v4] drm/amdgpu: add ioctl to handle RAS poison error

Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the 
AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace (ROCr) to 
control per-process SIGBUS delivery.

Userspace for this can be found at:
https://github.com/ROCm/rocm-systems/pull/6190

Signed-off-by: Yifan Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 27 +++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c    | 70 +++++++++++++++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h      | 14 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 23 +++++++
 include/uapi/drm/amdgpu_drm.h              | 21 +++++++
 8 files changed, 163 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5d7bfa59424a..771ec0608270 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1467,6 +1467,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc *crtc);  
void amdgpu_disable_vblank_kms(struct drm_crtc *crtc);  int 
amdgpu_info_ioctl(struct drm_device *dev, void *data,
                      struct drm_file *filp);
+int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
+                             struct drm_file *filp);

 /*
  * functions used by amdgpu_encoder.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 5333e052d56d..68d83a6e6b3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -210,6 +210,7 @@ int amdgpu_amdkfd_evict_userptr(struct 
mmu_interval_notifier *mni,  int amdgpu_amdkfd_bo_validate_and_fence(struct 
amdgpu_bo *bo,
                                        uint32_t domain,
                                        struct dma_fence *fence);
+int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms);
 #else
 static inline
 bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) @@ 
-241,6 +242,11 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, 
 {
        return 0;
 }
+/* Fallback variant for the #else branch: always reports -EOPNOTSUPP. */
+static inline
+int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms)
+{
+       return -EOPNOTSUPP;
+}
 #endif
 /* Shared API */
 int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size, 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1781c0c3d010..4d4d21babc61 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -3076,6 +3076,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
        DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
        DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, 
DRM_AUTH|DRM_RENDER_ALLOW),
        DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, 
amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, amdgpu_user_options_ioctl,
+DRM_AUTH|DRM_RENDER_ALLOW),
 };

 static const struct drm_driver amdgpu_kms_driver = { diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 24526e92f9b8..772e0fda7e14 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1423,6 +1423,33 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
        return 0;
 }

+/**
+ * amdgpu_user_options_ioctl - set user options for the calling process
+ *
+ * @dev: drm dev pointer
+ * @data: pointer to struct drm_amdgpu_user_options
+ * @filp: drm file
+ *
+ * Dispatches on the op field of @data. Currently the only supported option
+ * is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY, which controls how KFD
+ * delivers SIGBUS for poison/RAS events to the calling process (immediate,
+ * suppressed, or delayed by N milliseconds). The value is handed to
+ * amdgpu_amdkfd_set_sigbus_delay() for %current; neither @dev nor @filp is
+ * consulted here.
+ *
+ * Return: the result of amdgpu_amdkfd_set_sigbus_delay() for a supported
+ * op, -EINVAL for an unknown op.
+ */
+int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
+                             struct drm_file *filp)
+{
+       struct drm_amdgpu_user_options *args = data;
+
+       switch (args->op) {
+       case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY:
+               return amdgpu_amdkfd_set_sigbus_delay(current,
+                                                     args->kfd_sigbus_delay.value);
+       default:
+               DRM_DEBUG_KMS("Invalid user option op %u\n", args->op);
+               return -EINVAL;
+       }
+}
+
 /**
  * amdgpu_driver_open_kms - drm callback for open
  *
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9be798c0a2b..e7d70e3a7f3e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -29,10 +29,12 @@
 #include <linux/uaccess.h>
 #include <linux/mman.h>
 #include <linux/memory.h>
+#include <linux/workqueue.h>
 #include "kfd_priv.h"
 #include "kfd_events.h"
 #include "kfd_device_queue_manager.h"
 #include <linux/device.h>
+#include <uapi/drm/amdgpu_drm.h>

 /*
  * Wrapper around wait_queue_entry_t
@@ -1337,6 +1339,72 @@ void kfd_signal_reset_event(struct kfd_node *dev)
        srcu_read_unlock(&kfd_processes_srcu, idx);  }

+/*
+ * Per-process opt-in for poison-consumption SIGBUS handling.
+ *
+ * Default: kernel sends SIGBUS to the process immediately when poison is
+ * consumed, in addition to delivering the KFD HW/MEMORY exception events.
+ *
+ * Userspace (ROCr) can opt-in per-process via the
+ * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
+ * option. This lets the app's registered system-event callback handle the
+ * RAS error first, instead of being killed by SIGBUS.
+ *
+ * Encoded value (stored on the kfd_process):
+ *   0          - default: SIGBUS immediately (no opt-in)
+ *   0xFFFFFFFF - opt-in, never escalate to SIGBUS
+ *   N (other)  - opt-in, escalate to SIGBUS after N ms if app does not
+ *                handle the error in time (safety timeout)
+ */
+
+/*
+ * Delayed-work handler for the work item embedded in a kfd_process_device:
+ * sends SIGBUS to the owning process' lead thread (if it has one), then
+ * drops a reference on the process.
+ */
+static void kfd_signal_sigbus_delayed_fn(struct work_struct *work)
+{
+       struct kfd_process_device *pdd = container_of(to_delayed_work(work),
+                               struct kfd_process_device, work);
+       struct kfd_process *p = pdd->process;
+
+       if (p->lead_thread)
+               send_sig(SIGBUS, p->lead_thread, 0);
+
+       /* NOTE(review): presumably pairs with the reference taken when the work was queued. */
+       kfd_unref_process(p);
+}
+
+/*
+ * Deliver SIGBUS for a poison event on @dev to process @p, honouring the
+ * per-process p->kfd_sigbus_delay_ms setting:
+ *   AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED - log only, no signal;
+ *   0 - signal immediately;
+ *   N - queue pdd->work to signal after N ms, falling back to an
+ *       immediate signal when @p has no device data for @dev.
+ */
+static void kfd_signal_sigbus_with_delay(struct kfd_node *dev,
+                                        struct kfd_process *p)
+{
+       struct kfd_process_device *pdd;
+       u32 delay_ms = atomic_read(&p->kfd_sigbus_delay_ms);
+
+       if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) {
+               dev_info(dev->adev->dev,
+                        "SIGBUS suppressed for process %s(pid:%d): app opted in to handle RAS error\n",
+                        p->lead_thread->comm, p->lead_thread->pid);
+               return;
+       }
+
+       if (delay_ms == 0)
+               goto send_now;
+
+       pdd = kfd_get_process_device_data(dev, p);
+       if (!pdd) {
+               dev_err(dev->adev->dev, "Process device data doesn't exist\n");
+               goto send_now;
+       }
+
+       /*
+        * There is a single work item per process/device. If a SIGBUS is
+        * already queued, a further poison event inside the delay window
+        * must not re-initialise the queued item or take another process
+        * reference: the queued SIGBUS covers this event as well.
+        *
+        * NOTE(review): this relies on pdd being zero-initialised before
+        * the first INIT_DELAYED_WORK() and on calls for one pdd not
+        * racing each other - confirm. Initialising the work once at pdd
+        * creation would remove both assumptions.
+        */
+       if (delayed_work_pending(&pdd->work))
+               return;
+
+       /* Reference for the worker; kfd_signal_sigbus_delayed_fn() drops it. */
+       kref_get(&p->ref);
+       INIT_DELAYED_WORK(&pdd->work, kfd_signal_sigbus_delayed_fn);
+
+       dev_info(dev->adev->dev,
+                "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error opt-in safety timeout)\n",
+                p->lead_thread->comm, p->lead_thread->pid, delay_ms);
+       schedule_delayed_work(&pdd->work, msecs_to_jiffies(delay_ms));
+       return;
+
+send_now:
+       send_sig(SIGBUS, p->lead_thread, 0);
+}
+
 void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)  {
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); @@ 
-1391,7 +1459,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, 
u32 pasid)
        rcu_read_unlock();

        /* user application will handle SIGBUS signal */
-       send_sig(SIGBUS, p->lead_thread, 0);
+       kfd_signal_sigbus_with_delay(dev, p);

        kfd_unref_process(p);
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f037062c33ea..d3fcf07c0ebe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -875,6 +875,9 @@ struct kfd_process_device {
        u32 pasid;
        /* Indicates this process has requested PTL stay disabled */
        bool ptl_disable_req;
+
+       /* Delayed signal to user */
+       struct delayed_work work;
 };

 #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ 
-957,6 +960,17 @@ struct kfd_process {
        size_t signal_event_count;
        bool signal_event_limit_reached;

+       /**
+        * @kfd_sigbus_delay_ms: Per-process KFD SIGBUS delivery option for
+        * poison/RAS events (set via DRM_IOCTL_AMDGPU_USER_OPTIONS /
+        * AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY).
+        *
+        *   0          - send SIGBUS immediately (default)
+        *   0xFFFFFFFF - suppress SIGBUS delivery
+        *   other      - delay SIGBUS delivery by this many milliseconds
+        */
+       atomic_t kfd_sigbus_delay_ms;
+
        /* Information used for memory eviction */
        void *kgd_process_info;
        /* Eviction fence that is attached to all the BOs of this process. The 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 419bb8086ccd..dadb7cf7b072 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -992,6 +992,29 @@ struct kfd_process *kfd_create_process(struct task_struct 
*thread)
        return process;
 }

+/**
+ * amdgpu_amdkfd_set_sigbus_delay - Set per-process KFD SIGBUS delay
+ * @task: task in the target process
+ * @ms:   encoded delay value (0 = immediate, 0xFFFFFFFF = suppress,
+ *        otherwise delay in milliseconds)
+ *
+ * Stores the SIGBUS delivery option on the kfd_process associated with
+ * @task. If no kfd_process exists yet, one is created so the option
+ * persists until poison/RAS events are signaled.
+ *
+ * Return: 0 on success, or the error carried by the pointer returned from
+ * kfd_create_process() on failure.
+ */
+int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms)
+{
+       struct kfd_process *p;
+
+       p = kfd_create_process(task);
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       atomic_set(&p->kfd_sigbus_delay_ms, ms);
+       /* NOTE(review): assumes kfd_create_process() handed back a reference for the caller to drop - confirm. */
+       kfd_unref_process(p);
+       return 0;
+}
+
 static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)  {
        struct kfd_process *process;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h 
index 9f3090db2f16..ab71c4b4aeac 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -58,6 +58,7 @@ extern "C" {
 #define DRM_AMDGPU_USERQ_SIGNAL                0x17
 #define DRM_AMDGPU_USERQ_WAIT          0x18
 #define DRM_AMDGPU_GEM_LIST_HANDLES    0x19
+#define DRM_AMDGPU_USER_OPTIONS                0x1A

 #define DRM_IOCTL_AMDGPU_GEM_CREATE    DRM_IOWR(DRM_COMMAND_BASE + 
DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
 #define DRM_IOCTL_AMDGPU_GEM_MMAP      DRM_IOWR(DRM_COMMAND_BASE + 
DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
@@ -79,6 +80,7 @@ extern "C" {
 #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL  DRM_IOWR(DRM_COMMAND_BASE + 
DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal)
 #define DRM_IOCTL_AMDGPU_USERQ_WAIT    DRM_IOWR(DRM_COMMAND_BASE + 
DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait)
 #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + 
DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles)
+#define DRM_IOCTL_AMDGPU_USER_OPTIONS  DRM_IOWR(DRM_COMMAND_BASE + 
DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options)

 /**
  * DOC: memory domains
@@ -1673,6 +1675,25 @@ struct drm_amdgpu_info_uq_metadata {
 #define AMDGPU_FAMILY_GC_11_5_4                        154 /* GC 11.5.4 */
 #define AMDGPU_FAMILY_GC_12_0_0                        152 /* GC 12.0.0 */

+/*
+ * Definition of user options
+ *
+ * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
+ *    0:          Disable sigbus delay - SIGBUS will be raised immediately
+ *    0xFFFFFFFF: SIGBUS will not be raised
+ *    other:      Set the sigbus delay in milliseconds
+ */
+#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY                0
+
+#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED  0xFFFFFFFFu
+
+/* Argument structure of DRM_IOCTL_AMDGPU_USER_OPTIONS. */
+struct drm_amdgpu_user_options {
+       /* Option selector, e.g. AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY */
+       __u32 op;
+       /* Payload for AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY */
+       struct {
+               /*
+                * 0 (SIGBUS immediately),
+                * AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED (no SIGBUS),
+                * or a delay in milliseconds
+                */
+               __u32 value;
+       } kfd_sigbus_delay;
+};
+
 #if defined(__cplusplus)
 }
 #endif
--
2.43.0

Reply via email to