On 5/20/26 14:25, Pierre-Eric Pelloux-Prayer wrote: > > > Le 20/05/2026 à 11:14, Christian König a écrit : >> On 5/20/26 08:38, Prike Liang wrote: >>> From: Pierre-Eric Pelloux-Prayer <[email protected]> >>> >>> Add ftrace events for tracking the userq fence emit, signal >>> and queue state transition. >> >> The queue trace points look good to me, but clear NAK to the fence trace >> points those just duplicates the common trace points in the dma_fence >> framework. > > The dma_fence trace points don't contain enough context to be usable from a > tool (no device, no client id at the very least). > > The userqueue events are based on the gpu_scheduler traces and are what is > required for UMR to implement its Activity view.
In that case we should change umr to use the fence context instead of the client id and/or put the client/doorbell in the fence descripton. That's what this is good for. Creating new trace points to track userqueue usage and not using the standard dma_fence onces is an absolutely clear NO-GO from my side, do we also do that for the scheduler? Regards, Christian. > > Pierre-Eric > >> >> Regards, >> Christian. >> >>> >>> Signed-off-by: Pierre-Eric Pelloux-Prayer >>> <[email protected]> >>> Signed-off-by: Prike Liang <[email protected]> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 129 ++++++++++++++++++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++ >>> .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 13 +- >>> 3 files changed, 160 insertions(+), 3 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>> index 4ff8a4d7bb8b..32d8c36caaf3 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>> @@ -28,6 +28,8 @@ >>> #include <linux/types.h> >>> #include <linux/tracepoint.h> >>> +#include "amdgpu_userq_fence.h" >>> + >>> #undef TRACE_SYSTEM >>> #define TRACE_SYSTEM amdgpu >>> #define TRACE_INCLUDE_FILE amdgpu_trace >>> @@ -659,6 +661,133 @@ DEFINE_EVENT(amdgpu_userq_eviction_fence, >>> amdgpu_userq_eviction_fence_enable_sig >>> DEFINE_EVENT(amdgpu_userq_eviction_fence, >>> amdgpu_userq_eviction_fence_signal, >>> TP_PROTO(u64 context, u64 seqno), >>> TP_ARGS(context, seqno)); >>> +TRACE_EVENT(amdgpu_userq_job_run, >>> + TP_PROTO(struct device *device, struct amdgpu_usermode_queue >>> *queue, struct amdgpu_userq_fence *fence), >>> + TP_ARGS(device, queue, fence), >>> + TP_STRUCT__entry( >>> + __field(u64, fence_context) >>> + __field(u64, fence_seqno) >>> + __string(dev, dev_name(device)) >>> + __field(u64, doorbell_index) >>> + __field(u64, client_id) >>> + __field(u32, queue_type) >>> + ), >>> + TP_fast_assign( >>> + __entry->fence_context = fence->base.context; >>> + __entry->fence_seqno = fence->base.seqno; >>> + __assign_str(dev); >>> + __entry->doorbell_index = queue->doorbell_index; >>> + __entry->client_id = queue->userq_mgr->file->client_id; >>> + __entry->queue_type = queue->queue_type; >>> + ), >>> + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, >>> fence=%llu:%llu", >>> + __get_str(dev), __entry->client_id, __entry->queue_type, >>> __entry->doorbell_index, >>> + __entry->fence_context, >>> + __entry->fence_seqno) >>> +); >>> + >>> +TRACE_EVENT(amdgpu_userq_job_done, >>> + TP_PROTO(struct amdgpu_userq_fence *fence), >>> + TP_ARGS(fence), >>> + TP_STRUCT__entry( >>> + __field(u64, fence_context) >>> + __field(u64, fence_seqno) >>> + ), >>> + TP_fast_assign( >>> + __entry->fence_context = fence->base.context; >>> + __entry->fence_seqno = fence->base.seqno; >>> + ), >>> + TP_printk("fence=%llu:%llu", >>> + __entry->fence_context, >>> + __entry->fence_seqno) >>> +); >>> + >>> +TRACE_EVENT(amdgpu_userq_job_queue, >>> + TP_PROTO(struct device *device, >>> + struct amdgpu_usermode_queue *queue), >>> + TP_ARGS(device, queue), >>> + TP_STRUCT__entry(__field(u64, context) >>> + __string(dev, dev_name(device)) >>> + __field(u64, doorbell_index) >>> + __field(u64, client_id) >>> + __field(u32, queue_type) >>> + ), >>> + TP_fast_assign(__assign_str(dev); >>> + __entry->doorbell_index = queue->doorbell_index; >>> + __entry->queue_type = queue->queue_type; >>> + __entry->client_id = queue->userq_mgr->file->client_id; >>> + __entry->context = queue->fence_drv->context; >>> + ), >>> + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, >>> context=%llu", >>> + __get_str(dev), __entry->client_id, __entry->queue_type, >>> + __entry->doorbell_index, __entry->context) >>> +); >>> + >>> +TRACE_EVENT(amdgpu_userq_job_add_dep, >>> + TP_PROTO(struct device *device, struct amdgpu_usermode_queue >>> *queue, struct amdgpu_userq_fence *dep), >>> + TP_ARGS(device, queue, dep), >>> + TP_STRUCT__entry( >>> + __field(u64, context) >>> + __field(u64, dep_context) >>> + __field(u64, dep_seqno) >>> + __string(dev, dev_name(device)) >>> + __field(u64, doorbell_index) >>> + __field(u64, client_id) >>> + __field(u32, queue_type) >>> + ), >>> + TP_fast_assign( >>> + __assign_str(dev); >>> + __entry->doorbell_index = queue->doorbell_index; >>> + __entry->queue_type = queue->queue_type; >>> + __entry->client_id = queue->userq_mgr->file->client_id; >>> + __entry->context = queue->fence_drv->context; >>> + __entry->dep_context = dep->base.context; >>> + __entry->dep_seqno = dep->base.seqno; >>> + ), >>> + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, >>> context=%llu depends on fence=%llu:%llu", >>> + __get_str(dev), __entry->client_id, __entry->queue_type, >>> __entry->doorbell_index, __entry->context, >>> + __entry->dep_context, >>> + __entry->dep_seqno) >>> +); >>> + >>> +TRACE_EVENT(amdgpu_userq_state_start, >>> + TP_PROTO(struct amdgpu_usermode_queue *queue), >>> + TP_ARGS(queue), >>> + TP_STRUCT__entry( >>> + __field(u64, doorbell_index) >>> + __field(u64, client_id) >>> + __field(u32, queue_type) >>> + __field(u32, from) >>> + ), >>> + TP_fast_assign( >>> + __entry->doorbell_index = queue->doorbell_index; >>> + __entry->queue_type = queue->queue_type; >>> + __entry->client_id = queue->userq_mgr->file->client_id; >>> + __entry->from = queue->state; >>> + ), >>> + TP_printk("client_id=%llu, type=%u, doorbell=%llu, from=%d", >>> + __entry->client_id, __entry->queue_type, >>> __entry->doorbell_index, __entry->from) >>> +); >>> + >>> +TRACE_EVENT(amdgpu_userq_state_changed, >>> + TP_PROTO(struct amdgpu_usermode_queue *queue, enum >>> amdgpu_userq_state new_state), >>> + TP_ARGS(queue, new_state), >>> + TP_STRUCT__entry( >>> + __field(u64, doorbell_index) >>> + __field(u64, client_id) >>> + __field(u32, queue_type) >>> + __field(u32, to) >>> + ), >>> + TP_fast_assign( >>> + __entry->doorbell_index = queue->doorbell_index; >>> + __entry->queue_type = queue->queue_type; >>> + __entry->client_id = queue->userq_mgr->file->client_id; >>> + __entry->to = new_state; >>> + ), >>> + TP_printk("client_id=%llu, type=%u, doorbell=%llu, to=%d", >>> + __entry->client_id, __entry->queue_type, >>> __entry->doorbell_index, __entry->to) >>> +); >>> + >>> #undef AMDGPU_JOB_GET_TIMELINE_NAME >>> #endif >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>> index e27f9a76f986..60d1186af286 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>> @@ -329,11 +329,15 @@ static int amdgpu_userq_preempt_helper(struct >>> amdgpu_usermode_queue *queue) >>> int r; >>> if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { >>> + trace_amdgpu_userq_state_start(queue); >>> + >>> r = userq_funcs->preempt(queue); >>> if (r) { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_HUNG); >>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>> return r; >>> } else { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_PREEMPTED); >>> queue->state = AMDGPU_USERQ_STATE_PREEMPTED; >>> } >>> } >>> @@ -349,10 +353,14 @@ static int amdgpu_userq_restore_helper(struct >>> amdgpu_usermode_queue *queue) >>> int r = 0; >>> if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) { >>> + trace_amdgpu_userq_state_start(queue); >>> + >>> r = userq_funcs->restore(queue); >>> if (r) { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_HUNG); >>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>> } else { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_MAPPED); >>> queue->state = AMDGPU_USERQ_STATE_MAPPED; >>> } >>> } >>> @@ -370,12 +378,15 @@ static int amdgpu_userq_unmap_helper(struct >>> amdgpu_usermode_queue *queue) >>> if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || >>> (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { >>> + trace_amdgpu_userq_state_start(queue); >>> r = userq_funcs->unmap(queue); >>> if (r) { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_HUNG); >>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>> return r; >>> } else { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_UNMAPPED); >>> queue->state = AMDGPU_USERQ_STATE_UNMAPPED; >>> } >>> } >>> @@ -392,11 +403,15 @@ static int amdgpu_userq_map_helper(struct >>> amdgpu_usermode_queue *queue) >>> int r; >>> if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { >>> + trace_amdgpu_userq_state_start(queue); >>> + >>> r = userq_funcs->map(queue); >>> if (r) { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_HUNG); >>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>> return r; >>> } else { >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_MAPPED); >>> queue->state = AMDGPU_USERQ_STATE_MAPPED; >>> } >>> } >>> @@ -1007,6 +1022,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr >>> *uq_mgr) >>> if (!amdgpu_userq_buffer_vas_mapped(queue)) { >>> drm_file_err(uq_mgr->file, >>> "trying restore queue without va mapping\n"); >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_INVALID_VA); >>> queue->state = AMDGPU_USERQ_STATE_INVALID_VA; >>> continue; >>> } >>> @@ -1502,12 +1518,14 @@ void amdgpu_userq_pre_reset(struct amdgpu_device >>> *adev) >>> if (queue->state != AMDGPU_USERQ_STATE_MAPPED) >>> continue; >>> + trace_amdgpu_userq_state_start(queue); >>> userq_funcs = adev->userq_funcs[queue->queue_type]; >>> userq_funcs->unmap(queue); >>> /* just mark all queues as hung at this point. >>> * if unmap succeeds, we could map again >>> * in amdgpu_userq_post_reset() if vram is not lost >>> */ >>> + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG); >>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>> amdgpu_userq_fence_driver_force_completion(queue); >>> } >>> @@ -1526,6 +1544,8 @@ int amdgpu_userq_post_reset(struct amdgpu_device >>> *adev, bool vram_lost) >>> xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { >>> if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) { >>> + trace_amdgpu_userq_state_start(queue); >>> + >>> userq_funcs = adev->userq_funcs[queue->queue_type]; >>> /* Re-map queue */ >>> r = userq_funcs->map(queue); >>> @@ -1533,6 +1553,7 @@ int amdgpu_userq_post_reset(struct amdgpu_device >>> *adev, bool vram_lost) >>> dev_err(adev->dev, "Failed to remap queue %ld\n", >>> queue_id); >>> continue; >>> } >>> + trace_amdgpu_userq_state_changed(queue, >>> AMDGPU_USERQ_STATE_MAPPED); >>> queue->state = AMDGPU_USERQ_STATE_MAPPED; >>> } >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>> index 008330a0d852..00cc7194321c 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>> @@ -30,7 +30,7 @@ >>> #include <drm/drm_syncobj.h> >>> #include "amdgpu.h" >>> -#include "amdgpu_userq_fence.h" >>> +#include "amdgpu_trace.h" >>> #define AMDGPU_USERQ_MAX_HANDLES (1U << 16) >>> @@ -169,6 +169,7 @@ amdgpu_userq_fence_driver_process(struct >>> amdgpu_userq_fence_driver *fence_drv) >>> fence = &userq_fence->base; >>> list_del_init(&userq_fence->link); >>> dma_fence_signal(fence); >>> + trace_amdgpu_userq_job_done(userq_fence); >>> /* Drop fence_drv_array outside fence_list_lock >>> * to avoid the recursion lock. >>> */ >>> @@ -528,6 +529,8 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, >>> void *data, >>> /* Create the new fence */ >>> amdgpu_userq_fence_init(queue, fence, wptr); >>> + trace_amdgpu_userq_job_run(dev->dev, queue, fence); >>> + >>> mutex_unlock(&userq_mgr->userq_mutex); >>> /* >>> @@ -701,7 +704,7 @@ amdgpu_userq_wait_add_fence(struct >>> drm_amdgpu_userq_wait *wait_info, >>> } >>> static int >>> -amdgpu_userq_wait_return_fence_info(struct drm_file *filp, >>> +amdgpu_userq_wait_return_fence_info(struct drm_device *dev, struct >>> drm_file *filp, >>> struct drm_amdgpu_userq_wait *wait_info, >>> u32 *syncobj_handles, u32 *timeline_points, >>> u32 *timeline_handles, >>> @@ -835,6 +838,8 @@ amdgpu_userq_wait_return_fence_info(struct drm_file >>> *filp, >>> goto free_fences; >>> } >>> + trace_amdgpu_userq_job_queue(dev->dev, waitq); >>> + >>> for (i = 0, cnt = 0; i < num_fences; i++) { >>> struct amdgpu_userq_fence_driver *fence_drv; >>> struct amdgpu_userq_fence *userq_fence; >>> @@ -869,6 +874,8 @@ amdgpu_userq_wait_return_fence_info(struct drm_file >>> *filp, >>> amdgpu_userq_fence_driver_get(fence_drv); >>> + trace_amdgpu_userq_job_add_dep(dev->dev, waitq, userq_fence); >>> + >>> /* Store drm syncobj's gpu va address and value */ >>> fence_info[cnt].va = fence_drv->va; >>> fence_info[cnt].value = fences[i]->seqno; >>> @@ -968,7 +975,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, >>> void *data, >>> gobj_write, >>> gobj_read); >>> } else { >>> - r = amdgpu_userq_wait_return_fence_info(filp, wait_info, >>> + r = amdgpu_userq_wait_return_fence_info(dev, filp, wait_info, >>> syncobj_handles, >>> timeline_points, >>> timeline_handles,
