On 5/20/26 15:28, Pierre-Eric Pelloux-Prayer wrote: > Le 20/05/2026 à 14:33, Christian König a écrit : >> On 5/20/26 14:25, Pierre-Eric Pelloux-Prayer wrote: >>> >>> >>> Le 20/05/2026 à 11:14, Christian König a écrit : >>>> On 5/20/26 08:38, Prike Liang wrote: >>>>> From: Pierre-Eric Pelloux-Prayer <[email protected]> >>>>> >>>>> Add ftrace events for tracking the userq fence emit, signal >>>>> and queue state transition. >>>> >>>> The queue trace points look good to me, but clear NAK to the fence trace >>>> points those just duplicates the common trace points in the dma_fence >>>> framework. >>> >>> The dma_fence trace points don't contain enough context to be usable from a >>> tool (no device, no client id at the very least). >>> >>> The userqueue events are based on the gpu_scheduler traces and are what is >>> required for UMR to implement its Activity view. >> >> In that case we should change umr to use the fence context instead of the >> client id and/or put the client/doorbell in the fence descripton. That's >> what this is good for. > > It *is* using the fence context. Having the client_id helps associating with > information available elsewhere (fdinfo for instance). > >> >> Creating new trace points to track userqueue usage and not using the >> standard dma_fence onces is an absolutely clear NO-GO from my side, do we >> also do that for the scheduler? >> > > Yes, the gpu_scheduler trace events do the same thing.
Crap I completely missed that, I though that the scheduler trace points would expose additional stuff and not superseet the dma_fence trace points. Let's discuss tomorrow how to best handle that. Thanks, Christian. > > More below. > >> Regards, >> Christian. >> >>> >>> Pierre-Eric >>> >>>> >>>> Regards, >>>> Christian. >>>> >>>>> >>>>> Signed-off-by: Pierre-Eric Pelloux-Prayer >>>>> <[email protected]> >>>>> Signed-off-by: Prike Liang <[email protected]> >>>>> --- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 129 ++++++++++++++++++ >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 21 +++ >>>>> .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 13 +- >>>>> 3 files changed, 160 insertions(+), 3 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>>>> index 4ff8a4d7bb8b..32d8c36caaf3 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h >>>>> @@ -28,6 +28,8 @@ >>>>> #include <linux/types.h> >>>>> #include <linux/tracepoint.h> >>>>> +#include "amdgpu_userq_fence.h" >>>>> + >>>>> #undef TRACE_SYSTEM >>>>> #define TRACE_SYSTEM amdgpu >>>>> #define TRACE_INCLUDE_FILE amdgpu_trace >>>>> @@ -659,6 +661,133 @@ DEFINE_EVENT(amdgpu_userq_eviction_fence, >>>>> amdgpu_userq_eviction_fence_enable_sig >>>>> DEFINE_EVENT(amdgpu_userq_eviction_fence, >>>>> amdgpu_userq_eviction_fence_signal, >>>>> TP_PROTO(u64 context, u64 seqno), >>>>> TP_ARGS(context, seqno)); >>>>> +TRACE_EVENT(amdgpu_userq_job_run, >>>>> + TP_PROTO(struct device *device, struct amdgpu_usermode_queue >>>>> *queue, struct amdgpu_userq_fence *fence), >>>>> + TP_ARGS(device, queue, fence), >>>>> + TP_STRUCT__entry( >>>>> + __field(u64, fence_context) >>>>> + __field(u64, fence_seqno) > > In the context of userq, these fields are similar to dma_fence_init. > >>>>> + __string(dev, dev_name(device)) >>>>> + __field(u64, doorbell_index) >>>>> + __field(u64, client_id) >>>>> + __field(u32, queue_type) > > These 4 are missing in dma_fence_init and useful for UMR. eg: neither > dma_fence_init nor dma_fence_signalled trace the device. They only trace the > timeline which is not unique on a system. > >>>>> + ), >>>>> + TP_fast_assign( >>>>> + __entry->fence_context = fence->base.context; >>>>> + __entry->fence_seqno = fence->base.seqno; >>>>> + __assign_str(dev); >>>>> + __entry->doorbell_index = queue->doorbell_index; >>>>> + __entry->client_id = queue->userq_mgr->file->client_id; >>>>> + __entry->queue_type = queue->queue_type; >>>>> + ), >>>>> + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, >>>>> fence=%llu:%llu", >>>>> + __get_str(dev), __entry->client_id, __entry->queue_type, >>>>> __entry->doorbell_index, >>>>> + __entry->fence_context, >>>>> + __entry->fence_seqno) >>>>> +); >>>>> + >>>>> +TRACE_EVENT(amdgpu_userq_job_done, >>>>> + TP_PROTO(struct amdgpu_userq_fence *fence), >>>>> + TP_ARGS(fence), >>>>> + TP_STRUCT__entry( >>>>> + __field(u64, fence_context) >>>>> + __field(u64, fence_seqno) >>>>> + ), >>>>> + TP_fast_assign( >>>>> + __entry->fence_context = fence->base.context; >>>>> + __entry->fence_seqno = fence->base.seqno; >>>>> + ), >>>>> + TP_printk("fence=%llu:%llu", >>>>> + __entry->fence_context, >>>>> + __entry->fence_seqno) > > This one is indeed a duplicate of dma_fence_signaled. > It exists so we have similar events as gpu_scheduler but we can get rid of it > if you want. > (the only caveat is that dma_fence_signaled traces context and seqno as 32bit > integers). > > The other events below have no dma_fence events equivalent so are they fine? > > Pierre-Eric > >>>>> +); >>>>> + >>>>> +TRACE_EVENT(amdgpu_userq_job_queue, >>>>> + TP_PROTO(struct device *device, >>>>> + struct amdgpu_usermode_queue *queue), >>>>> + TP_ARGS(device, queue), >>>>> + TP_STRUCT__entry(__field(u64, context) >>>>> + __string(dev, dev_name(device)) >>>>> + __field(u64, doorbell_index) >>>>> + __field(u64, client_id) >>>>> + __field(u32, queue_type) >>>>> + ), >>>>> + TP_fast_assign(__assign_str(dev); >>>>> + __entry->doorbell_index = queue->doorbell_index; >>>>> + __entry->queue_type = queue->queue_type; >>>>> + __entry->client_id = queue->userq_mgr->file->client_id; >>>>> + __entry->context = queue->fence_drv->context; >>>>> + ), >>>>> + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, >>>>> context=%llu", >>>>> + __get_str(dev), __entry->client_id, __entry->queue_type, >>>>> + __entry->doorbell_index, __entry->context) >>>>> +); >>>>> + >>>>> +TRACE_EVENT(amdgpu_userq_job_add_dep, >>>>> + TP_PROTO(struct device *device, struct amdgpu_usermode_queue >>>>> *queue, struct amdgpu_userq_fence *dep), >>>>> + TP_ARGS(device, queue, dep), >>>>> + TP_STRUCT__entry( >>>>> + __field(u64, context) >>>>> + __field(u64, dep_context) >>>>> + __field(u64, dep_seqno) >>>>> + __string(dev, dev_name(device)) >>>>> + __field(u64, doorbell_index) >>>>> + __field(u64, client_id) >>>>> + __field(u32, queue_type) >>>>> + ), >>>>> + TP_fast_assign( >>>>> + __assign_str(dev); >>>>> + __entry->doorbell_index = queue->doorbell_index; >>>>> + __entry->queue_type = queue->queue_type; >>>>> + __entry->client_id = queue->userq_mgr->file->client_id; >>>>> + __entry->context = queue->fence_drv->context; >>>>> + __entry->dep_context = dep->base.context; >>>>> + __entry->dep_seqno = dep->base.seqno; >>>>> + ), >>>>> + TP_printk("dev=%s, client_id=%llu, type=%u, doorbell=%llu, >>>>> context=%llu depends on fence=%llu:%llu", >>>>> + __get_str(dev), __entry->client_id, __entry->queue_type, >>>>> __entry->doorbell_index, __entry->context, >>>>> + __entry->dep_context, >>>>> + __entry->dep_seqno) >>>>> +); >>>>> + >>>>> +TRACE_EVENT(amdgpu_userq_state_start, >>>>> + TP_PROTO(struct amdgpu_usermode_queue *queue), >>>>> + TP_ARGS(queue), >>>>> + TP_STRUCT__entry( >>>>> + __field(u64, doorbell_index) >>>>> + __field(u64, client_id) >>>>> + __field(u32, queue_type) >>>>> + __field(u32, from) >>>>> + ), >>>>> + TP_fast_assign( >>>>> + __entry->doorbell_index = queue->doorbell_index; >>>>> + __entry->queue_type = queue->queue_type; >>>>> + __entry->client_id = queue->userq_mgr->file->client_id; >>>>> + __entry->from = queue->state; >>>>> + ), >>>>> + TP_printk("client_id=%llu, type=%u, doorbell=%llu, from=%d", >>>>> + __entry->client_id, __entry->queue_type, >>>>> __entry->doorbell_index, __entry->from) >>>>> +); >>>>> + >>>>> +TRACE_EVENT(amdgpu_userq_state_changed, >>>>> + TP_PROTO(struct amdgpu_usermode_queue *queue, enum >>>>> amdgpu_userq_state new_state), >>>>> + TP_ARGS(queue, new_state), >>>>> + TP_STRUCT__entry( >>>>> + __field(u64, doorbell_index) >>>>> + __field(u64, client_id) >>>>> + __field(u32, queue_type) >>>>> + __field(u32, to) >>>>> + ), >>>>> + TP_fast_assign( >>>>> + __entry->doorbell_index = queue->doorbell_index; >>>>> + __entry->queue_type = queue->queue_type; >>>>> + __entry->client_id = queue->userq_mgr->file->client_id; >>>>> + __entry->to = new_state; >>>>> + ), >>>>> + TP_printk("client_id=%llu, type=%u, doorbell=%llu, to=%d", >>>>> + __entry->client_id, __entry->queue_type, >>>>> __entry->doorbell_index, __entry->to) >>>>> +); >>>>> + >>>>> #undef AMDGPU_JOB_GET_TIMELINE_NAME >>>>> #endif >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>>>> index e27f9a76f986..60d1186af286 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c >>>>> @@ -329,11 +329,15 @@ static int amdgpu_userq_preempt_helper(struct >>>>> amdgpu_usermode_queue *queue) >>>>> int r; >>>>> if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { >>>>> + trace_amdgpu_userq_state_start(queue); >>>>> + >>>>> r = userq_funcs->preempt(queue); >>>>> if (r) { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_HUNG); >>>>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>>>> return r; >>>>> } else { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_PREEMPTED); >>>>> queue->state = AMDGPU_USERQ_STATE_PREEMPTED; >>>>> } >>>>> } >>>>> @@ -349,10 +353,14 @@ static int amdgpu_userq_restore_helper(struct >>>>> amdgpu_usermode_queue *queue) >>>>> int r = 0; >>>>> if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) { >>>>> + trace_amdgpu_userq_state_start(queue); >>>>> + >>>>> r = userq_funcs->restore(queue); >>>>> if (r) { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_HUNG); >>>>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>>>> } else { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_MAPPED); >>>>> queue->state = AMDGPU_USERQ_STATE_MAPPED; >>>>> } >>>>> } >>>>> @@ -370,12 +378,15 @@ static int amdgpu_userq_unmap_helper(struct >>>>> amdgpu_usermode_queue *queue) >>>>> if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || >>>>> (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { >>>>> + trace_amdgpu_userq_state_start(queue); >>>>> r = userq_funcs->unmap(queue); >>>>> if (r) { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_HUNG); >>>>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>>>> return r; >>>>> } else { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_UNMAPPED); >>>>> queue->state = AMDGPU_USERQ_STATE_UNMAPPED; >>>>> } >>>>> } >>>>> @@ -392,11 +403,15 @@ static int amdgpu_userq_map_helper(struct >>>>> amdgpu_usermode_queue *queue) >>>>> int r; >>>>> if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { >>>>> + trace_amdgpu_userq_state_start(queue); >>>>> + >>>>> r = userq_funcs->map(queue); >>>>> if (r) { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_HUNG); >>>>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>>>> return r; >>>>> } else { >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_MAPPED); >>>>> queue->state = AMDGPU_USERQ_STATE_MAPPED; >>>>> } >>>>> } >>>>> @@ -1007,6 +1022,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr >>>>> *uq_mgr) >>>>> if (!amdgpu_userq_buffer_vas_mapped(queue)) { >>>>> drm_file_err(uq_mgr->file, >>>>> "trying restore queue without va mapping\n"); >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_INVALID_VA); >>>>> queue->state = AMDGPU_USERQ_STATE_INVALID_VA; >>>>> continue; >>>>> } >>>>> @@ -1502,12 +1518,14 @@ void amdgpu_userq_pre_reset(struct amdgpu_device >>>>> *adev) >>>>> if (queue->state != AMDGPU_USERQ_STATE_MAPPED) >>>>> continue; >>>>> + trace_amdgpu_userq_state_start(queue); >>>>> userq_funcs = adev->userq_funcs[queue->queue_type]; >>>>> userq_funcs->unmap(queue); >>>>> /* just mark all queues as hung at this point. >>>>> * if unmap succeeds, we could map again >>>>> * in amdgpu_userq_post_reset() if vram is not lost >>>>> */ >>>>> + trace_amdgpu_userq_state_changed(queue, AMDGPU_USERQ_STATE_HUNG); >>>>> queue->state = AMDGPU_USERQ_STATE_HUNG; >>>>> amdgpu_userq_fence_driver_force_completion(queue); >>>>> } >>>>> @@ -1526,6 +1544,8 @@ int amdgpu_userq_post_reset(struct amdgpu_device >>>>> *adev, bool vram_lost) >>>>> xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { >>>>> if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) { >>>>> + trace_amdgpu_userq_state_start(queue); >>>>> + >>>>> userq_funcs = adev->userq_funcs[queue->queue_type]; >>>>> /* Re-map queue */ >>>>> r = userq_funcs->map(queue); >>>>> @@ -1533,6 +1553,7 @@ int amdgpu_userq_post_reset(struct amdgpu_device >>>>> *adev, bool vram_lost) >>>>> dev_err(adev->dev, "Failed to remap queue %ld\n", >>>>> queue_id); >>>>> continue; >>>>> } >>>>> + trace_amdgpu_userq_state_changed(queue, >>>>> AMDGPU_USERQ_STATE_MAPPED); >>>>> queue->state = AMDGPU_USERQ_STATE_MAPPED; >>>>> } >>>>> } >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>>>> index 008330a0d852..00cc7194321c 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c >>>>> @@ -30,7 +30,7 @@ >>>>> #include <drm/drm_syncobj.h> >>>>> #include "amdgpu.h" >>>>> -#include "amdgpu_userq_fence.h" >>>>> +#include "amdgpu_trace.h" >>>>> #define AMDGPU_USERQ_MAX_HANDLES (1U << 16) >>>>> @@ -169,6 +169,7 @@ amdgpu_userq_fence_driver_process(struct >>>>> amdgpu_userq_fence_driver *fence_drv) >>>>> fence = &userq_fence->base; >>>>> list_del_init(&userq_fence->link); >>>>> dma_fence_signal(fence); >>>>> + trace_amdgpu_userq_job_done(userq_fence); >>>>> /* Drop fence_drv_array outside fence_list_lock >>>>> * to avoid the recursion lock. >>>>> */ >>>>> @@ -528,6 +529,8 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, >>>>> void *data, >>>>> /* Create the new fence */ >>>>> amdgpu_userq_fence_init(queue, fence, wptr); >>>>> + trace_amdgpu_userq_job_run(dev->dev, queue, fence); >>>>> + >>>>> mutex_unlock(&userq_mgr->userq_mutex); >>>>> /* >>>>> @@ -701,7 +704,7 @@ amdgpu_userq_wait_add_fence(struct >>>>> drm_amdgpu_userq_wait *wait_info, >>>>> } >>>>> static int >>>>> -amdgpu_userq_wait_return_fence_info(struct drm_file *filp, >>>>> +amdgpu_userq_wait_return_fence_info(struct drm_device *dev, struct >>>>> drm_file *filp, >>>>> struct drm_amdgpu_userq_wait *wait_info, >>>>> u32 *syncobj_handles, u32 *timeline_points, >>>>> u32 *timeline_handles, >>>>> @@ -835,6 +838,8 @@ amdgpu_userq_wait_return_fence_info(struct drm_file >>>>> *filp, >>>>> goto free_fences; >>>>> } >>>>> + trace_amdgpu_userq_job_queue(dev->dev, waitq); >>>>> + >>>>> for (i = 0, cnt = 0; i < num_fences; i++) { >>>>> struct amdgpu_userq_fence_driver *fence_drv; >>>>> struct amdgpu_userq_fence *userq_fence; >>>>> @@ -869,6 +874,8 @@ amdgpu_userq_wait_return_fence_info(struct drm_file >>>>> *filp, >>>>> amdgpu_userq_fence_driver_get(fence_drv); >>>>> + trace_amdgpu_userq_job_add_dep(dev->dev, waitq, userq_fence); >>>>> + >>>>> /* Store drm syncobj's gpu va address and value */ >>>>> fence_info[cnt].va = fence_drv->va; >>>>> fence_info[cnt].value = fences[i]->seqno; >>>>> @@ -968,7 +975,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, >>>>> void *data, >>>>> gobj_write, >>>>> gobj_read); >>>>> } else { >>>>> - r = amdgpu_userq_wait_return_fence_info(filp, wait_info, >>>>> + r = amdgpu_userq_wait_return_fence_info(dev, filp, wait_info, >>>>> syncobj_handles, >>>>> timeline_points, >>>>> timeline_handles, >>
