Public
Reviewed-by: Harish Kasiviswanathan <[email protected]> ________________________________ From: amd-gfx <[email protected]> on behalf of Andrew Martin <[email protected]> Sent: Wednesday, May 27, 2026 5:15 PM To: [email protected] <[email protected]> Cc: Martin, Andrew <[email protected]>; Martin, Andrew <[email protected]>; Claude : Sonnet 4 <[email protected]> Subject: [PATCH v3] drm/amdkfd: Fix SMI event PID reporting for containers SMI events were reporting incorrect PIDs in containerized environments, causing test failures where container processes expected to see their namespace-local PIDs but instead received global host PIDs. The issue had two root causes: 1. Event functions were called from kernel context (page fault handlers, migration workers), where 'current' refers to the kernel worker thread, not the userspace GPU process that triggered the event. 2. PID conversion used task_tgid_vnr(), which returns the PID in the caller's namespace (init namespace for kernel threads), not the task's own namespace. This patch updates the SMI event interface: - Change 8 event function signatures to accept a task_struct pointer instead of a pid_t, allowing proper namespace-aware PID conversion - Convert PIDs using task_tgid_nr_ns(task, task_active_pid_ns(task)), which returns the PID as the process sees it via getpid() - Update 10 call sites to pass p->lead_thread (the GPU process) instead of p->lead_thread->pid or current (kernel worker) This ensures SMI events report container-local PIDs, which is critical for containerized GPU workloads to correctly correlate events with their processes. 
Tested-by: Andrew Martin <[email protected]> Assisted-by: Claude:Sonnet 4 <[email protected]> Signed-off-by: Andrew Martin <[email protected]> --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 8 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 95 +++++++++++++-------- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 14 +-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 +- 5 files changed, 76 insertions(+), 53 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 28dc6886c1ff..226e76ae0be7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -424,7 +424,7 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, migrate.dst = migrate.src + npages; scratch = (dma_addr_t *)(migrate.dst + npages); - kfd_smi_event_migration_start(node, p->lead_thread->pid, + kfd_smi_event_migration_start(node, p->lead_thread, start >> PAGE_SHIFT, end >> PAGE_SHIFT, 0, node->id, prange->prefetch_loc, prange->preferred_loc, trigger); @@ -462,7 +462,7 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange, out_free: kvfree(buf); - kfd_smi_event_migration_end(node, p->lead_thread->pid, + kfd_smi_event_migration_end(node, p->lead_thread, start >> PAGE_SHIFT, end >> PAGE_SHIFT, 0, node->id, trigger, r); out: @@ -727,7 +727,7 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, migrate.fault_page = fault_page; scratch = (dma_addr_t *)(migrate.dst + npages); - kfd_smi_event_migration_start(node, p->lead_thread->pid, + kfd_smi_event_migration_start(node, p->lead_thread, start >> PAGE_SHIFT, end >> PAGE_SHIFT, node->id, 0, prange->prefetch_loc, prange->preferred_loc, trigger); @@ -766,7 +766,7 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange, out_free: kvfree(buf); - kfd_smi_event_migration_end(node, p->lead_thread->pid, + kfd_smi_event_migration_end(node, 
p->lead_thread, start >> PAGE_SHIFT, end >> PAGE_SHIFT, node->id, 0, trigger, r); out: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 419bb8086ccd..0ed315eb0bca 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1975,7 +1975,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) struct kfd_process_device *pdd = p->pdds[i]; struct device *dev = pdd->dev->adev->dev; - kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid, + kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread, trigger); r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, @@ -2005,7 +2005,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) if (n_evicted == 0) break; - kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid); + kfd_smi_event_queue_restore(pdd->dev, p->lead_thread); if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd)) @@ -2028,7 +2028,7 @@ int kfd_process_restore_queues(struct kfd_process *p) struct kfd_process_device *pdd = p->pdds[i]; struct device *dev = pdd->dev->adev->dev; - kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid); + kfd_smi_event_queue_restore(pdd->dev, p->lead_thread); r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, &pdd->qpd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index d2bc169e84b0..a08fdef97917 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -195,17 +195,35 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev, rcu_read_unlock(); } +/** + * kfd_smi_task_to_pid - Convert task to namespace-aware PID + * @task: task_struct pointer (typically p->lead_thread) + * + * Returns the PID as it appears in the task's own PID namespace. 
+ * For containerized processes, this returns the container-local PID + * (what getpid() returns), not the global host PID. + * + * Returns 0 if task is NULL. + */ +static inline pid_t kfd_smi_task_to_pid(struct task_struct *task) +{ + return task ? task_tgid_nr_ns(task, task_active_pid_ns(task)) : 0; +} + __printf(4, 5) -static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev, +static void kfd_smi_event_add(struct task_struct *task, struct kfd_node *dev, unsigned int event, char *fmt, ...) { char fifo_in[KFD_SMI_EVENT_MSG_SIZE]; int len; va_list args; + pid_t pid; if (list_empty(&dev->smi_clients)) return; + pid = kfd_smi_task_to_pid(task); + len = snprintf(fifo_in, sizeof(fifo_in), "%x ", event); va_start(args, fmt); @@ -234,14 +252,15 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, amdgpu_reset_get_desc(reset_context, reset_cause, sizeof(reset_cause)); - kfd_smi_event_add(0, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET( + kfd_smi_event_add(NULL, dev, event, KFD_EVENT_FMT_UPDATE_GPU_RESET( dev->reset_seq_num, reset_cause)); } void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask) { - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, KFD_EVENT_FMT_THERMAL_THROTTLING( + kfd_smi_event_add(NULL, dev, KFD_SMI_EVENT_THERMAL_THROTTLE, + KFD_EVENT_FMT_THERMAL_THROTTLING( throttle_bitmask, amdgpu_dpm_get_thermal_throttling_counter(dev->adev))); } @@ -254,65 +273,67 @@ void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid) if (task_info) { /* Report VM faults from user applications, not retry from kernel */ if (task_info->task.pid) - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( + kfd_smi_event_add(NULL, dev, KFD_SMI_EVENT_VMFAULT, KFD_EVENT_FMT_VMFAULT( task_info->task.pid, task_info->task.comm)); amdgpu_vm_put_task_info(task_info); } } -void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, +void kfd_smi_event_page_fault_start(struct 
kfd_node *node, struct task_struct *task, unsigned long address, bool write_fault, ktime_t ts) { - kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_START, - KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), pid, - address, node->id, write_fault ? 'W' : 'R')); + kfd_smi_event_add(task, node, KFD_SMI_EVENT_PAGE_FAULT_START, + KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts), + kfd_smi_task_to_pid(task), address, node->id, + write_fault ? 'W' : 'R')); } -void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, +void kfd_smi_event_page_fault_end(struct kfd_node *node, struct task_struct *task, unsigned long address, bool migration) { - kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END, + kfd_smi_event_add(task, node, KFD_SMI_EVENT_PAGE_FAULT_END, KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(), - pid, address, node->id, migration ? 'M' : 'U')); + kfd_smi_task_to_pid(task), address, node->id, + migration ? 'M' : 'U')); } -void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, +void kfd_smi_event_migration_start(struct kfd_node *node, struct task_struct *task, unsigned long start, unsigned long end, uint32_t from, uint32_t to, uint32_t prefetch_loc, uint32_t preferred_loc, uint32_t trigger) { - kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START, - KFD_EVENT_FMT_MIGRATE_START( - ktime_get_boottime_ns(), pid, start, end - start, - from, to, prefetch_loc, preferred_loc, trigger)); + kfd_smi_event_add(task, node, KFD_SMI_EVENT_MIGRATE_START, + KFD_EVENT_FMT_MIGRATE_START(ktime_get_boottime_ns(), + kfd_smi_task_to_pid(task), start, end - start, from, + to, prefetch_loc, preferred_loc, trigger)); } -void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, +void kfd_smi_event_migration_end(struct kfd_node *node, struct task_struct *task, unsigned long start, unsigned long end, uint32_t from, uint32_t to, uint32_t trigger, int error_code) { - kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_END, - KFD_EVENT_FMT_MIGRATE_END( - 
ktime_get_boottime_ns(), pid, start, end - start, - from, to, trigger, error_code)); + kfd_smi_event_add(task, node, KFD_SMI_EVENT_MIGRATE_END, + KFD_EVENT_FMT_MIGRATE_END(ktime_get_boottime_ns(), + kfd_smi_task_to_pid(task), start, end - start, from, + to, trigger, error_code)); } -void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, +void kfd_smi_event_queue_eviction(struct kfd_node *node, struct task_struct *task, uint32_t trigger) { - kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_EVICTION, - KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), pid, - node->id, trigger)); + kfd_smi_event_add(task, node, KFD_SMI_EVENT_QUEUE_EVICTION, + KFD_EVENT_FMT_QUEUE_EVICTION(ktime_get_boottime_ns(), + kfd_smi_task_to_pid(task), node->id, trigger)); } -void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid) +void kfd_smi_event_queue_restore(struct kfd_node *node, struct task_struct *task) { - kfd_smi_event_add(pid, node, KFD_SMI_EVENT_QUEUE_RESTORE, - KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), pid, - node->id, '0')); + kfd_smi_event_add(task, node, KFD_SMI_EVENT_QUEUE_RESTORE, + KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), + kfd_smi_task_to_pid(task), node->id, '0')); } void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm) @@ -327,21 +348,23 @@ void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm) for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; - kfd_smi_event_add(p->lead_thread->pid, pdd->dev, + kfd_smi_event_add(p->lead_thread, pdd->dev, KFD_SMI_EVENT_QUEUE_RESTORE, KFD_EVENT_FMT_QUEUE_RESTORE(ktime_get_boottime_ns(), - p->lead_thread->pid, pdd->dev->id, 'R')); + kfd_smi_task_to_pid(p->lead_thread), + pdd->dev->id, 'R')); } kfd_unref_process(p); } -void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid, +void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, struct task_struct *task, unsigned long address, unsigned long last, uint32_t trigger) { - 
kfd_smi_event_add(pid, node, KFD_SMI_EVENT_UNMAP_FROM_GPU, + kfd_smi_event_add(task, node, KFD_SMI_EVENT_UNMAP_FROM_GPU, KFD_EVENT_FMT_UNMAP_FROM_GPU(ktime_get_boottime_ns(), - pid, address, last - address + 1, node->id, trigger)); + kfd_smi_task_to_pid(task), address, + last - address + 1, node->id, trigger)); } void kfd_smi_event_process(struct kfd_process_device *pdd, bool start) @@ -356,7 +379,7 @@ void kfd_smi_event_process(struct kfd_process_device *pdd, bool start) task_info = amdgpu_vm_get_task_info_vm(avm); if (task_info) { - kfd_smi_event_add(0, pdd->dev, + kfd_smi_event_add(NULL, pdd->dev, start ? KFD_SMI_EVENT_PROCESS_START : KFD_SMI_EVENT_PROCESS_END, KFD_EVENT_FMT_PROCESS(task_info->task.pid, @@ -385,7 +408,7 @@ int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd) spin_lock_init(&client->lock); client->events = 0; client->dev = dev; - client->pid = current->tgid; + client->pid = kfd_smi_task_to_pid(current); client->suser = capable(CAP_SYS_ADMIN); spin_lock(&dev->smi_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index bb4d72b57387..afa93d7cfa7f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -32,25 +32,25 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev, uint64_t throttle_bitmask); void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset, struct amdgpu_reset_context *reset_context); -void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid, +void kfd_smi_event_page_fault_start(struct kfd_node *node, struct task_struct *task, unsigned long address, bool write_fault, ktime_t ts); -void kfd_smi_event_page_fault_end(struct kfd_node *node, pid_t pid, +void kfd_smi_event_page_fault_end(struct kfd_node *node, struct task_struct *task, unsigned long address, bool migration); -void kfd_smi_event_migration_start(struct kfd_node *node, pid_t pid, +void 
kfd_smi_event_migration_start(struct kfd_node *node, struct task_struct *task, unsigned long start, unsigned long end, uint32_t from, uint32_t to, uint32_t prefetch_loc, uint32_t preferred_loc, uint32_t trigger); -void kfd_smi_event_migration_end(struct kfd_node *node, pid_t pid, +void kfd_smi_event_migration_end(struct kfd_node *node, struct task_struct *task, unsigned long start, unsigned long end, uint32_t from, uint32_t to, uint32_t trigger, int error_code); -void kfd_smi_event_queue_eviction(struct kfd_node *node, pid_t pid, +void kfd_smi_event_queue_eviction(struct kfd_node *node, struct task_struct *task, uint32_t trigger); -void kfd_smi_event_queue_restore(struct kfd_node *node, pid_t pid); +void kfd_smi_event_queue_restore(struct kfd_node *node, struct task_struct *task); void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm); -void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, pid_t pid, +void kfd_smi_event_unmap_from_gpu(struct kfd_node *node, struct task_struct *task, unsigned long address, unsigned long last, uint32_t trigger); void kfd_smi_event_process(struct kfd_process_device *pdd, bool start); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 25b3ecf85f30..b9f62129a825 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1408,7 +1408,7 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, return -EINVAL; } - kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid, + kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread, start, last, trigger); r = svm_range_unmap_from_gpu(pdd->dev->adev, @@ -3205,7 +3205,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, svms, prange->start, prange->last, best_loc, prange->actual_loc); - kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr, + kfd_smi_event_page_fault_start(node, p->lead_thread, addr, write_fault, timestamp); /* Align migration range 
start and size to granularity size */ @@ -3248,7 +3248,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, r, svms, start, last); out_migrate_fail: - kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr, + kfd_smi_event_page_fault_end(node, p->lead_thread, addr, migration); out_unlock_range: -- 2.34.1
