Hi Maíra, this pactch looks good to me but I am wondering if we should do anything to clean up the perfmon state when a gpu reset happens: i.e. what would happen if a perfmon job resets? Would we continue to schedule after-reset jobs to wait on the fence of the reset job?
Iago El dom, 31-05-2026 a las 17:18 -0300, Maíra Canal escribió: > A non-global perfmon is meant to count events generated by a specific > submission, but the scheduler can run jobs from different queues > concurrently on the same V3D core. Without explicit serialization, an > unrelated job running in parallel with a perfmon-carrying job > pollutes > the counters and generates unusable results. > > To address such issue, we must enforce cross-queue serialization when > we > detect a perfmon-carrying submission. It's possible to implement > serialization by enforcing two rules: > > 1. A job that carries a non-global perfmon must wait for every job > currently in-flight across all HW queues to finish. > > 2. While a perfmon-carrying job is still in-flight, all > subsequently > submitted jobs must wait for it. > > Note that serialization is not needed in the global perfmon case, as > the > global perfmon tracks activity from all jobs, so concurrency is > desirable. > > Therefore, check if serialization is needed during job submission and > if > so, attach fence dependences to enforce cross-queue serialization. > > Signed-off-by: Maíra Canal <[email protected]> > --- > drivers/gpu/drm/v3d/v3d_drv.h | 35 ++++++++++++++++++---- > drivers/gpu/drm/v3d/v3d_gem.c | 3 ++ > drivers/gpu/drm/v3d/v3d_perfmon.c | 6 ++++ > drivers/gpu/drm/v3d/v3d_submit.c | 63 > +++++++++++++++++++++++++++++++++++++++ > 4 files changed, 101 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/v3d/v3d_drv.h > b/drivers/gpu/drm/v3d/v3d_drv.h > index 51486af68cf4..cdf4926d51f2 100644 > --- a/drivers/gpu/drm/v3d/v3d_drv.h > +++ b/drivers/gpu/drm/v3d/v3d_drv.h > @@ -74,11 +74,13 @@ struct v3d_queue_state { > spinlock_t queue_lock; > }; > > -/* Performance monitor object. The perform lifetime is controlled by > userspace > - * using perfmon related ioctls. A perfmon can be attached to a > submit_cl > - * request, and when this is the case, HW perf counters will be > activated just > - * before the submit_cl is submitted to the GPU and disabled when > the job is > - * done. This way, only events related to a specific job will be > counted. > +/* Performance monitor object > + * > + * The performance monitor (perfmon) lifetime is controlled by > userspace using > + * perfmon related ioctls. A perfmon can be attached to a CL or CSD > submission > + * request, and when it is, HW performance counters will be > activated just > + * before the job is submitted to the GPU and disabled when the job > is done. > + * This way, only events related to a specific submission will be > counted. > */ > struct v3d_perfmon { > /* Tracks the number of users of the perfmon, when this > counter reaches > @@ -167,13 +169,31 @@ struct v3d_dev { > > struct v3d_queue_state queue[V3D_MAX_QUEUES]; > > - /* Tracks the performance monitor state. */ > + /* > + * Tracks the performance monitor state and consistency. > + * > + * When a non-global perfmon is attached to a job, the > scheduler must > + * not run any other job on the HW concurrently (otherwise, > the > + * counters would be polluted by unrelated work). > + */ > struct { > /* Protects @active. */ > spinlock_t lock; > > /* Perfmon currently programmed in HW (or NULL if > none). */ > struct v3d_perfmon *active; > + > + /* Finished fence of the most recently submitted job > that > + * opened a serialization window (i.e. a job with a > non-global > + * perfmon attached). > + */ > + struct dma_fence *fence; > + > + /* Finished fence of the most recently submitted job > on each HW > + * queue. Used so that a new perfmon-carrying job > can depend on > + * every job currently in-flight across all queues. > + */ > + struct dma_fence *last_hw_fence[V3D_MAX_QUEUES]; > } perfmon_state; > > /* Protects bo_stats */ > @@ -319,6 +339,9 @@ struct v3d_job { > > struct v3d_dev *v3d; > > + /* The queue that the job was submitted on. */ > + enum v3d_queue queue; > + > /* This is the array of BOs that were looked up at the start > * of submission. > */ > diff --git a/drivers/gpu/drm/v3d/v3d_gem.c > b/drivers/gpu/drm/v3d/v3d_gem.c > index 9487ab7acd03..0b415486f5b4 100644 > --- a/drivers/gpu/drm/v3d/v3d_gem.c > +++ b/drivers/gpu/drm/v3d/v3d_gem.c > @@ -363,8 +363,11 @@ v3d_gem_destroy(struct drm_device *dev) > for (q = 0; q < V3D_MAX_QUEUES; q++) { > WARN_ON(v3d->queue[q].active_job); > v3d_stats_put(v3d->queue[q].stats); > + dma_fence_put(v3d->perfmon_state.last_hw_fence[q]); > } > > + dma_fence_put(v3d->perfmon_state.fence); > + > drm_mm_takedown(&v3d->mm); > > dma_free_coherent(v3d->drm.dev, 4096 * 1024, (void *)v3d- > >pt, > diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c > b/drivers/gpu/drm/v3d/v3d_perfmon.c > index 3ad0f022753c..07dab7fb3060 100644 > --- a/drivers/gpu/drm/v3d/v3d_perfmon.c > +++ b/drivers/gpu/drm/v3d/v3d_perfmon.c > @@ -275,6 +275,12 @@ void v3d_perfmon_start(struct v3d_dev *v3d, > struct v3d_perfmon *perfmon) > if (!perfmon || v3d->global_perfmon) > return; > > + /* Cross-queue serialization should have drained any > previous perfmon > + * job before this one runs. > + */ > + if (WARN_ON_ONCE(v3d->perfmon_state.active)) > + return; > + > if (!pm_runtime_get_if_active(v3d->drm.dev)) > return; > > diff --git a/drivers/gpu/drm/v3d/v3d_submit.c > b/drivers/gpu/drm/v3d/v3d_submit.c > index 4c526aafc4e0..46fc88bacc95 100644 > --- a/drivers/gpu/drm/v3d/v3d_submit.c > +++ b/drivers/gpu/drm/v3d/v3d_submit.c > @@ -234,6 +234,7 @@ v3d_submit_add_job(struct v3d_submit *submit, > void **container, size_t size, > job->v3d = v3d; > job->free = free; > job->file_priv = v3d_priv; > + job->queue = queue; > > ret = drm_sched_job_init(&job->base, &v3d_priv- > >sched_entity[queue], > 1, v3d_priv, submit->file_priv- > >client_id); > @@ -293,6 +294,62 @@ v3d_attach_perfmon_to_jobs(struct v3d_submit > *submit, u32 perfmon_id) > return 0; > } > > +/* > + * Prepare fences to enforce job serialization when a perfmon is > active. A job > + * that carries a non-global perfmon must wait for every job > currently in-flight > + * across all HW queues to finish, otherwise concurrent unrelated > work on the > + * same core would pollute the performance counters. Symmetrically, > while such a > + * job is still in-flight, all subsequently submitted jobs must wait > for it. > + * > + * We don't serialize the jobs when using a global perfmon as it's > expected to > + * track concurrent activity from all jobs. > + */ > +static int > +v3d_serialize_for_perfmon(struct v3d_job *job) > +{ > + struct v3d_dev *v3d = job->v3d; > + bool is_global_perfmon; > + int ret; > + > + lockdep_assert_held(&v3d->sched_lock); > + > + scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) > + is_global_perfmon = !!v3d->global_perfmon; > + > + if (is_global_perfmon) > + goto publish; > + > + if (job->perfmon) { > + for (enum v3d_queue q = 0; q < V3D_MAX_QUEUES; q++) > { > + struct dma_fence *f = v3d- > >perfmon_state.last_hw_fence[q]; > + > + if (!f || dma_fence_is_signaled(f)) > + continue; > + > + ret = drm_sched_job_add_dependency(&job- > >base, dma_fence_get(f)); > + if (ret) > + return ret; > + } > + } else if (v3d->perfmon_state.fence && > + !dma_fence_is_signaled(v3d->perfmon_state.fence)) > { > + ret = drm_sched_job_add_dependency(&job->base, > + > dma_fence_get(v3d->perfmon_state.fence)); > + if (ret) > + return ret; > + } > + > +publish: > + dma_fence_put(v3d->perfmon_state.last_hw_fence[job->queue]); > + v3d->perfmon_state.last_hw_fence[job->queue] = > dma_fence_get(job->done_fence); > + > + if (job->perfmon && !is_global_perfmon) { > + dma_fence_put(v3d->perfmon_state.fence); > + v3d->perfmon_state.fence = dma_fence_get(job- > >done_fence); > + } > + > + return 0; > +} > + > static void > v3d_submit_attach_object_fences(struct v3d_submit *submit) > { > @@ -337,6 +394,12 @@ v3d_submit_jobs(struct v3d_submit *submit) > goto err; > } > > + for (int i = 0; i < submit->job_count; i++) { > + ret = v3d_serialize_for_perfmon(submit->jobs[i]); > + if (ret) > + goto err; > + } > + > for (int i = 0; i < submit->job_count; i++) > drm_sched_entity_push_job(&submit->jobs[i]->base); > >
