AMD General

Seems fine to me. I will wait for Lijo and David to chime in on their v2/v3
change requests before I give a Reviewed-by.

 Kent

> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Zhang,
> Yifan
> Sent: May 24, 2026 7:18 AM
> To: [email protected]
> Cc: Kuehling, Felix <[email protected]>; Deucher, Alexander
> <[email protected]>; Koenig, Christian
> <[email protected]>; Yat Sin, David <[email protected]>; Lazar,
> Lijo <[email protected]>
> Subject: RE: [PATCH v4] drm/amdgpu: add ioctl to handle RAS poison error
>
> AMD General
>
> ping
>
> -----Original Message-----
> From: Zhang, Yifan <[email protected]>
> Sent: Thursday, May 21, 2026 5:05 PM
> To: [email protected]
> Cc: Kuehling, Felix <[email protected]>; Deucher, Alexander
> <[email protected]>; Koenig, Christian
> <[email protected]>; Yat Sin, David <[email protected]>; Lazar,
> Lijo <[email protected]>; Zhang, Yifan <[email protected]>
> Subject: [PATCH v4] drm/amdgpu: add ioctl to handle RAS poison error
>
> Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the
> AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace
> (ROCr) to control per-process SIGBUS delivery.
>
> Userspace for this can be found at:
> https://github.com/ROCm/rocm-systems/pull/6190
>
> Signed-off-by: Yifan Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  6 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 27 +++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c    | 70 +++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h      | 14 +++++
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 23 +++++++
>  include/uapi/drm/amdgpu_drm.h              | 21 +++++++
>  8 files changed, 163 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 5d7bfa59424a..771ec0608270 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1467,6 +1467,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc
> *crtc);  void amdgpu_disable_vblank_kms(struct drm_crtc *crtc);  int
> amdgpu_info_ioctl(struct drm_device *dev, void *data,
>                       struct drm_file *filp);
> +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
> +                             struct drm_file *filp);
>
>  /*
>   * functions used by amdgpu_encoder.c
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 5333e052d56d..68d83a6e6b3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -210,6 +210,7 @@ int amdgpu_amdkfd_evict_userptr(struct
> mmu_interval_notifier *mni,  int amdgpu_amdkfd_bo_validate_and_fence(struct
> amdgpu_bo *bo,
>                                         uint32_t domain,
>                                         struct dma_fence *fence);
> +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms);
>  #else
>  static inline
>  bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
> @@ -241,6 +242,11 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct
> amdgpu_bo *bo,  {
>         return 0;
>  }
> +static inline
> +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms) {
> +       return -EOPNOTSUPP;
> +}
>  #endif
>  /* Shared API */
>  int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 1781c0c3d010..4d4d21babc61 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -3076,6 +3076,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
>         DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL,
> amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
>         DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl,
> DRM_AUTH|DRM_RENDER_ALLOW),
>         DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES,
> amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> +       DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS,
> amdgpu_user_options_ioctl,
> +DRM_AUTH|DRM_RENDER_ALLOW),
>  };
>
>  static const struct drm_driver amdgpu_kms_driver = { diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 24526e92f9b8..772e0fda7e14 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1423,6 +1423,33 @@ int amdgpu_info_ioctl(struct drm_device *dev, void
> *data, struct drm_file *filp)
>         return 0;
>  }
>
> +/**
> + * amdgpu_user_options_ioctl - set per-fd user options
> + *
> + * @dev: drm dev pointer
> + * @data: pointer to struct drm_amdgpu_user_options
> + * @filp: drm file
> + *
> + * Sets options stored on the per-file amdgpu_fpriv. Currently the only
> + * supported option is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY, which
> + * controls how KFD delivers SIGBUS for poison/RAS events to the calling
> + * process (immediate, suppressed, or delayed by N milliseconds).
> + */
> +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
> +                             struct drm_file *filp)
> +{
> +       struct drm_amdgpu_user_options *args = data;
> +
> +       switch (args->op) {
> +       case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY:
> +               return amdgpu_amdkfd_set_sigbus_delay(current,
> +                                                     
> args->kfd_sigbus_delay.value);
> +       default:
> +               DRM_DEBUG_KMS("Invalid user option op %u\n", args->op);
> +               return -EINVAL;
> +       }
> +}
> +
>  /**
>   * amdgpu_driver_open_kms - drm callback for open
>   *
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> index e9be798c0a2b..e7d70e3a7f3e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
> @@ -29,10 +29,12 @@
>  #include <linux/uaccess.h>
>  #include <linux/mman.h>
>  #include <linux/memory.h>
> +#include <linux/workqueue.h>
>  #include "kfd_priv.h"
>  #include "kfd_events.h"
>  #include "kfd_device_queue_manager.h"
>  #include <linux/device.h>
> +#include <uapi/drm/amdgpu_drm.h>
>
>  /*
>   * Wrapper around wait_queue_entry_t
> @@ -1337,6 +1339,72 @@ void kfd_signal_reset_event(struct kfd_node *dev)
>         srcu_read_unlock(&kfd_processes_srcu, idx);  }
>
> +/*
> + * Per-process opt-in for poison-consumption SIGBUS handling.
> + *
> + * Default: kernel sends SIGBUS to the process immediately when poison is
> + * consumed, in addition to delivering the KFD HW/MEMORY exception events.
> + *
> + * Userspace (ROCr) can opt in per-process via the
> + * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
> + * option. This lets the app's registered system-event callback handle the
> + * RAS error first, instead of being killed by SIGBUS.
> + *
> + * Encoded value (stored on the kfd_process):
> + *   0          - default: SIGBUS immediately (no opt-in)
> + *   0xFFFFFFFF - opt-in, never escalate to SIGBUS
> + *   N (other)  - opt-in, escalate to SIGBUS after N ms if app does not
> + *                handle the error in time (safety timeout)
> + */
> +
> +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) {
> +       struct kfd_process_device *pdd = container_of(to_delayed_work(work),
> +                               struct kfd_process_device, work);
> +       struct kfd_process *p = pdd->process;
> +
> +       if (p->lead_thread)
> +               send_sig(SIGBUS, p->lead_thread, 0);
> +
> +       kfd_unref_process(p);
> +}
> +
> +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev,
> +                                        struct kfd_process *p)
> +{
> +       struct kfd_process_device *pdd;
> +       u32 delay_ms = atomic_read(&p->kfd_sigbus_delay_ms);
> +
> +       if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED)
> {
> +               dev_info(dev->adev->dev,
> +                        "SIGBUS suppressed for process %s(pid:%d): app opted 
> in to handle
> RAS error\n",
> +                        p->lead_thread->comm, p->lead_thread->pid);
> +               return;
> +       }
> +
> +       if (delay_ms == 0)
> +               goto send_now;
> +
> +       pdd = kfd_get_process_device_data(dev, p);
> +       if (!pdd) {
> +               dev_err(dev->adev->dev, "Process device data doesn't 
> exist\n");
> +               goto send_now;
> +       }
> +
> +       /* Take an extra reference for the delayed worker. */
> +       kref_get(&p->ref);
> +       INIT_DELAYED_WORK(&pdd->work, kfd_signal_sigbus_delayed_fn);
> +
> +       dev_info(dev->adev->dev,
> +                "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error 
> opt-in
> safety timeout)\n",
> +                p->lead_thread->comm, p->lead_thread->pid, delay_ms);
> +       schedule_delayed_work(&pdd->work, msecs_to_jiffies(delay_ms));
> +       return;
> +
> +send_now:
> +       send_sig(SIGBUS, p->lead_thread, 0);
> +}
> +
>  void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)  {
>         struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); @@ -
> 1391,7 +1459,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node
> *dev, u32 pasid)
>         rcu_read_unlock();
>
>         /* user application will handle SIGBUS signal */
> -       send_sig(SIGBUS, p->lead_thread, 0);
> +       kfd_signal_sigbus_with_delay(dev, p);
>
>         kfd_unref_process(p);
>  }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index f037062c33ea..d3fcf07c0ebe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -875,6 +875,9 @@ struct kfd_process_device {
>         u32 pasid;
>         /* Indicates this process has requested PTL stay disabled */
>         bool ptl_disable_req;
> +
> +       /* Delayed signal to user */
> +       struct delayed_work work;
>  };
>
>  #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -
> 957,6 +960,17 @@ struct kfd_process {
>         size_t signal_event_count;
>         bool signal_event_limit_reached;
>
> +       /**
> +        * @kfd_sigbus_delay_ms: Per-process KFD SIGBUS delivery option for
> +        * poison/RAS events (set via DRM_IOCTL_AMDGPU_USER_OPTIONS /
> +        * AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY).
> +        *
> +        *   0          - send SIGBUS immediately (default)
> +        *   0xFFFFFFFF - suppress SIGBUS delivery
> +        *   other      - delay SIGBUS delivery by this many milliseconds
> +        */
> +       atomic_t kfd_sigbus_delay_ms;
> +
>         /* Information used for memory eviction */
>         void *kgd_process_info;
>         /* Eviction fence that is attached to all the BOs of this process. 
> The diff --git
> a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 419bb8086ccd..dadb7cf7b072 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -992,6 +992,29 @@ struct kfd_process *kfd_create_process(struct
> task_struct *thread)
>         return process;
>  }
>
> +/**
> + * amdgpu_amdkfd_set_sigbus_delay - Set per-process KFD SIGBUS delay
> + * @task: task in the target process
> + * @ms:   encoded delay value (0 = immediate, 0xFFFFFFFF = suppress,
> + *        otherwise delay in milliseconds)
> + *
> + * Stores the SIGBUS delivery option on the kfd_process associated with
> + * @task. If no kfd_process exists yet, one is created so the option
> + * persists until poison/RAS events are signaled.
> + */
> +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms) {
> +       struct kfd_process *p;
> +
> +       p = kfd_create_process(task);
> +       if (IS_ERR(p))
> +               return PTR_ERR(p);
> +
> +       atomic_set(&p->kfd_sigbus_delay_ms, ms);
> +       kfd_unref_process(p);
> +       return 0;
> +}
> +
>  static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)  {
>         struct kfd_process *process;
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 9f3090db2f16..ab71c4b4aeac 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -58,6 +58,7 @@ extern "C" {
>  #define DRM_AMDGPU_USERQ_SIGNAL                0x17
>  #define DRM_AMDGPU_USERQ_WAIT          0x18
>  #define DRM_AMDGPU_GEM_LIST_HANDLES    0x19
> +#define DRM_AMDGPU_USER_OPTIONS                0x1A
>
>  #define DRM_IOCTL_AMDGPU_GEM_CREATE
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union
> drm_amdgpu_gem_create)
>  #define DRM_IOCTL_AMDGPU_GEM_MMAP
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union
> drm_amdgpu_gem_mmap)
> @@ -79,6 +80,7 @@ extern "C" {
>  #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct
> drm_amdgpu_userq_signal)
>  #define DRM_IOCTL_AMDGPU_USERQ_WAIT
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct
> drm_amdgpu_userq_wait)
>  #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES,
> struct drm_amdgpu_gem_list_handles)
> +#define DRM_IOCTL_AMDGPU_USER_OPTIONS
> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USER_OPTIONS, struct
> drm_amdgpu_user_options)
>
>  /**
>   * DOC: memory domains
> @@ -1673,6 +1675,25 @@ struct drm_amdgpu_info_uq_metadata {
>  #define AMDGPU_FAMILY_GC_11_5_4                        154 /* GC 11.5.4 */
>  #define AMDGPU_FAMILY_GC_12_0_0                        152 /* GC 12.0.0 */
>
> +/*
> + * Definition of user options
> + *
> + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
> + *    0:          Disable sigbus delay - SIGBUS will be raised immediately
> + *    0xFFFFFFFF: SIGBUS will not be raised
> + *    other:      Set the sigbus delay in milliseconds
> + */
> +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY                0
> +
> +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED
> 0xFFFFFFFFu
> +
> +struct drm_amdgpu_user_options {
> +       __u32 op;
> +       struct {
> +               __u32 value;
> +       } kfd_sigbus_delay;
> +};
> +
>  #if defined(__cplusplus)
>  }
>  #endif
> --
> 2.43.0

Reply via email to