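Add a new DRM_IOCTL_AMDGPU_USER_OPTIONS ioctl whose first option, AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY, lets userspace (ROCr) control, per process, how KFD delivers SIGBUS when poison is consumed: a value of 0 keeps the default of sending SIGBUS immediately, 0xFFFFFFFF suppresses SIGBUS, and any other value delays SIGBUS by that many milliseconds. This gives the application's registered system-event callback a chance to handle the RAS error before the process is killed by SIGBUS.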
Userspace for this can be found at: https://github.com/ROCm/rocm-systems/pull/6190 Signed-off-by: Yifan Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 6 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 27 +++++++++ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 70 +++++++++++++++++++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 23 +++++++ include/uapi/drm/amdgpu_drm.h | 21 +++++++ 8 files changed, 163 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 5d7bfa59424a..771ec0608270 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1467,6 +1467,8 @@ int amdgpu_enable_vblank_kms(struct drm_crtc *crtc); void amdgpu_disable_vblank_kms(struct drm_crtc *crtc); int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp); /* * functions used by amdgpu_encoder.c diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 5333e052d56d..68d83a6e6b3a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -210,6 +210,7 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, uint32_t domain, struct dma_fence *fence); +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms); #else static inline bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) @@ -241,6 +242,11 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo, { return 0; } +static inline +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms) +{ + return -EOPNOTSUPP; +} #endif /* Shared API */ int 
amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 1781c0c3d010..4d4d21babc61 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -3076,6 +3076,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(AMDGPU_GEM_LIST_HANDLES, amdgpu_gem_list_handles_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(AMDGPU_USER_OPTIONS, amdgpu_user_options_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), }; static const struct drm_driver amdgpu_kms_driver = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 24526e92f9b8..772e0fda7e14 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1423,6 +1423,33 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) return 0; } +/** + * amdgpu_user_options_ioctl - set per-fd user options + * + * @dev: drm dev pointer + * @data: pointer to struct drm_amdgpu_user_options + * @filp: drm file + * + * Sets options stored on the per-file amdgpu_fpriv. Currently the only + * supported option is %AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY which + * controls how KFD delivers SIGBUS for poison/RAS events to the calling + * process (immediate, suppressed, or delayed by N milliseconds). 
+ */ +int amdgpu_user_options_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + struct drm_amdgpu_user_options *args = data; + + switch (args->op) { + case AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY: + return amdgpu_amdkfd_set_sigbus_delay(current, + args->kfd_sigbus_delay.value); + default: + DRM_DEBUG_KMS("Invalid user option op %u\n", args->op); + return -EINVAL; + } +} + /** * amdgpu_driver_open_kms - drm callback for open * diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index e9be798c0a2b..e7d70e3a7f3e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -29,10 +29,12 @@ #include <linux/uaccess.h> #include <linux/mman.h> #include <linux/memory.h> +#include <linux/workqueue.h> #include "kfd_priv.h" #include "kfd_events.h" #include "kfd_device_queue_manager.h" #include <linux/device.h> +#include <uapi/drm/amdgpu_drm.h> /* * Wrapper around wait_queue_entry_t @@ -1337,6 +1339,72 @@ void kfd_signal_reset_event(struct kfd_node *dev) srcu_read_unlock(&kfd_processes_srcu, idx); } +/* + * Per-process opt-in for poison-consumption SIGBUS handling. + * + * Default: kernel sends SIGBUS to the process immediately when poison is + * consumed, in addition to delivering the KFD HW/MEMORY exception events. + * + * Userspace (ROCr) can opt-in per-process via the + * DRM_IOCTL_AMDGPU_USER_OPTIONS / AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY + * option. This lets the app's registered system-event callback handle the + * RAS error first, instead of being killed by SIGBUS. 
+ * + * Encoded value (stored on the kfd_process): + * 0 - default: SIGBUS immediately (no opt-in) + * 0xFFFFFFFF - opt-in, never escalate to SIGBUS + * N (other) - opt-in, escalate to SIGBUS after N ms if app does not + * handle the error in time (safety timeout) + */ + +static void kfd_signal_sigbus_delayed_fn(struct work_struct *work) +{ + struct kfd_process_device *pdd = container_of(to_delayed_work(work), + struct kfd_process_device, work); + struct kfd_process *p = pdd->process; + + if (p->lead_thread) + send_sig(SIGBUS, p->lead_thread, 0); + + kfd_unref_process(p); +} + +static void kfd_signal_sigbus_with_delay(struct kfd_node *dev, + struct kfd_process *p) +{ + struct kfd_process_device *pdd; + u32 delay_ms = atomic_read(&p->kfd_sigbus_delay_ms); + + if (delay_ms == AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED) { + dev_info(dev->adev->dev, + "SIGBUS suppressed for process %s(pid:%d): app opted in to handle RAS error\n", + p->lead_thread->comm, p->lead_thread->pid); + return; + } + + if (delay_ms == 0) + goto send_now; + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + dev_err(dev->adev->dev, "Process device data doesn't exist\n"); + goto send_now; + } + + /* Take an extra reference for the delayed worker. 
*/ + kref_get(&p->ref); + INIT_DELAYED_WORK(&pdd->work, kfd_signal_sigbus_delayed_fn); + + dev_info(dev->adev->dev, + "Deferring SIGBUS to process %s(pid:%d) by %u ms (RAS error opt-in safety timeout)\n", + p->lead_thread->comm, p->lead_thread->pid, delay_ms); + schedule_delayed_work(&pdd->work, msecs_to_jiffies(delay_ms)); + return; + +send_now: + send_sig(SIGBUS, p->lead_thread, 0); +} + void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) { struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); @@ -1391,7 +1459,7 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) rcu_read_unlock(); /* user application will handle SIGBUS signal */ - send_sig(SIGBUS, p->lead_thread, 0); + kfd_signal_sigbus_with_delay(dev, p); kfd_unref_process(p); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index f037062c33ea..d3fcf07c0ebe 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -875,6 +875,9 @@ struct kfd_process_device { u32 pasid; /* Indicates this process has requested PTL stay disabled */ bool ptl_disable_req; + + /* Delayed signal to user */ + struct delayed_work work; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -957,6 +960,17 @@ struct kfd_process { size_t signal_event_count; bool signal_event_limit_reached; + /** + * @kfd_sigbus_delay_ms: Per-process KFD SIGBUS delivery option for + * poison/RAS events (set via DRM_IOCTL_AMDGPU_USER_OPTIONS / + * AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY). + * + * 0 - send SIGBUS immediately (default) + * 0xFFFFFFFF - suppress SIGBUS delivery + * other - delay SIGBUS delivery by this many milliseconds + */ + atomic_t kfd_sigbus_delay_ms; + /* Information used for memory eviction */ void *kgd_process_info; /* Eviction fence that is attached to all the BOs of this process. 
The diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 419bb8086ccd..dadb7cf7b072 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -992,6 +992,29 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) return process; } +/** + * amdgpu_amdkfd_set_sigbus_delay - Set per-process KFD SIGBUS delay + * @task: task in the target process + * @ms: encoded delay value (0 = immediate, 0xFFFFFFFF = suppress, + * otherwise delay in milliseconds) + * + * Stores the SIGBUS delivery option on the kfd_process associated with + * @task. If no kfd_process exists yet, one is created so the option + * persists until poison/RAS events are signaled. + */ +int amdgpu_amdkfd_set_sigbus_delay(struct task_struct *task, u32 ms) +{ + struct kfd_process *p; + + p = kfd_create_process(task); + if (IS_ERR(p)) + return PTR_ERR(p); + + atomic_set(&p->kfd_sigbus_delay_ms, ms); + kfd_unref_process(p); + return 0; +} + static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) { struct kfd_process *process; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 9f3090db2f16..ab71c4b4aeac 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -58,6 +58,7 @@ extern "C" { #define DRM_AMDGPU_USERQ_SIGNAL 0x17 #define DRM_AMDGPU_USERQ_WAIT 0x18 #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 +#define DRM_AMDGPU_USER_OPTIONS 0x1A #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) @@ -79,6 +80,7 @@ extern "C" { #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) #define 
DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) +#define DRM_IOCTL_AMDGPU_USER_OPTIONS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USER_OPTIONS, struct drm_amdgpu_user_options) /** * DOC: memory domains @@ -1673,6 +1675,25 @@ struct drm_amdgpu_info_uq_metadata { #define AMDGPU_FAMILY_GC_11_5_4 154 /* GC 11.5.4 */ #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ +/* + * Definition of user options + * + * option: AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY + * 0: Disable sigbus delay - SIGBUS will be raised immediately + * 0xFFFFFFFF: SIGBUS will not be raised + * other: Set the sigbus delay in milliseconds + */ +#define AMDGPU_USER_OPTIONS_OP_KFD_SIGBUS_DELAY 0 + +#define AMDGPU_USER_OPTIONS_KFD_SIGBUS_DELAY_DISABLED 0xFFFFFFFFu + +struct drm_amdgpu_user_options { + __u32 op; + struct { + __u32 value; + } kfd_sigbus_delay; +}; + #if defined(__cplusplus) } #endif -- 2.43.0
