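Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl with the
AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY option, allowing userspace (ROCr)
to control how KFD delivers SIGBUS to a process on poison consumption:
immediately (0, the default), never (0xFFFFFFFF), or after a delay of N
milliseconds, so that the application can handle the RAS error first.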
Userspace for this can be found at:
https://github.com/ROCm/rocm-systems/pull/6148
Signed-off-by: Yifan Zhang <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 12 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 29 ++++++
drivers/gpu/drm/amd/amdkfd/kfd_events.c | 114 +++++++++++++++++++++++-
include/uapi/drm/amdgpu_drm.h | 25 ++++++
5 files changed, 179 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5d7bfa59424a..6a5459b59af2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -455,6 +455,16 @@ struct amdgpu_fpriv {
/** GPU partition selection */
uint32_t xcp_id;
+
+ /**
+ * @kfd_sigbus_delay_ms: Per-fd KFD SIGBUS delivery option (set via
+ * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY).
+ *
+ * 0 - send SIGBUS immediately (default)
+ * 0xFFFFFFFF - suppress SIGBUS delivery
+ * other - delay SIGBUS delivery by this many milliseconds
+ */
+ atomic_t kfd_sigbus_delay_ms;
};
int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
@@ -1467,6 +1477,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc *crtc);
void amdgpu_disable_vblank_kms(struct drm_crtc *crtc);
int amdgpu_info_ioctl(struct drm_device *dev, void *data,
struct drm_file *filp);
+int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *filp);
/*
* functions used by amdgpu_encoder.c
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 99688391e70b..cad18bd6f8b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -3078,6 +3078,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl,
DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl,
DRM_AUTH|DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES,
amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
+ DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, amdgpu_user_options_ioctl,
DRM_AUTH|DRM_RENDER_ALLOW),
};
static const struct drm_driver amdgpu_kms_driver = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 24526e92f9b8..7903587b8bbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1423,6 +1423,35 @@ int amdgpu_info_ioctl(struct drm_device *dev, void
*data, struct drm_file *filp)
return 0;
}
+/**
+ * amdgpu_user_options_ioctl - set per-fd user options
+ *
+ * @dev: drm dev pointer
+ * @data: pointer to struct drm_amdgpu_user_options
+ * @filp: drm file
+ *
+ * Stores the option on the per-file amdgpu_fpriv. The only supported op is
+ * %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY: how KFD delivers the poison/RAS
+ * SIGBUS to the calling process (immediately, never, or after N ms).
+ * Return: 0 on success, -EINVAL for an unsupported op.
+ */
+int amdgpu_user_options_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *filp)
+{
+ struct amdgpu_fpriv *fpriv = filp->driver_priv;
+ struct drm_amdgpu_user_options *args = data;
+
+ switch (args->op) {
+ case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY:
+ atomic_set(&fpriv->kfd_sigbus_delay_ms,
+ args->kfd_sigbus_delay.value); /* stored as-is, no range check */
+ return 0;
+ default:
+ DRM_DEBUG_KMS("Invalid user option op %u\n", args->op);
+ return -EINVAL;
+ }
+}
+
/**
* amdgpu_driver_open_kms - drm callback for open
*
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9be798c0a2b..2ff6348105b7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -29,10 +29,12 @@
#include <linux/uaccess.h>
#include <linux/mman.h>
#include <linux/memory.h>
+#include <linux/workqueue.h>
#include "kfd_priv.h"
#include "kfd_events.h"
#include "kfd_device_queue_manager.h"
#include <linux/device.h>
+#include <uapi/drm/amdgpu_drm.h>
/*
* Wrapper around wait_queue_entry_t
@@ -1337,6 +1339,115 @@ void kfd_signal_reset_event(struct kfd_node *dev)
srcu_read_unlock(&kfd_processes_srcu, idx);
}
+/*
+ * Per-process opt-in for poison-consumption SIGBUS handling.
+ *
+ * Default: kernel sends SIGBUS to the process immediately when poison is
+ * consumed, in addition to delivering the KFD HW/MEMORY exception events.
+ *
+ * Userspace (ROCr) can opt-in per-process via the
+ * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY
+ * option. This lets the app's registered system-event callback handle the
+ * RAS error first, instead of being killed by SIGBUS.
+ *
+ * Encoded value (set on any of the process' amdgpu render fds):
+ * 0 - default: SIGBUS immediately (no opt-in)
+ * 0xFFFFFFFF - opt-in, never escalate to SIGBUS
+ * N (other) - opt-in, escalate to SIGBUS after N ms if app does not
+ * handle the error in time (safety timeout)
+ *
+ * Per-process scope: the option is honored if ANY of the process' amdgpu
+ * fds has been configured. This gives per-process semantics (set once by
+ * the application at init) while keeping the UAPI on amdgpu, where ROCr
+ * sets it.
+ */
+struct kfd_sigbus_delayed_work {
+ struct delayed_work work; /* embedded; handler container_of()s back from it */
+ struct kfd_process *p; /* process to signal; handler drops one ref on it */
+};
+/* Delayed-work handler: send SIGBUS to the process, drop its ref, free dw. */
+static void kfd_signal_sigbus_delayed_fn(struct work_struct *work)
+{
+ struct kfd_sigbus_delayed_work *dw = container_of(to_delayed_work(work),
+ struct kfd_sigbus_delayed_work, work);
+ struct kfd_process *p = dw->p;
+
+ if (p->lead_thread) /* nothing to signal without a lead thread */
+ send_sig(SIGBUS, p->lead_thread, 0);
+
+ kfd_unref_process(p); /* presumably pairs with a ref taken at queue time - confirm */
+ kfree(dw); /* the handler frees its own work item */
+}
+
+/*
+ * Resolve the per-process SIGBUS opt-in setting by scanning all of the
+ * process' KFD pdds (each backed by an amdgpu render fd). Returns the
+ * "most lenient" value across all fds, in this priority:
+ * DISABLED (no SIGBUS) > any non-zero timeout > 0 (immediate)
+ *
+ * Rationale: if the app has explicitly opted in on any GPU it uses, it
+ * wants the chance to handle the error in userspace.
+ */
+static u32 kfd_get_sigbus_delay_ms(struct kfd_process *p)