Hi Iago,

On 02/06/26 04:35, Iago Toral wrote:
Hi Maíra, this pactch looks good to me but I am wondering if we should
do anything to clean up the perfmon state when a gpu reset happens:
i.e. what would happen if a perfmon job resets? Would we continue to
schedule after-reset jobs to wait on the fence of the reset job?

If the job resets, the design around the DRM scheduler will protect us.

The DRM scheduler guarantees that a finished fence is always signaled
eventually. On a reset, drm_sched_start() signals the pending jobs'
finished fences (with -ECANCELED for the guilty job). So if a
perfmon-carrying job is the one that times out, the done_fence we
stored for serialization still gets signaled when resubmitted (check the
first if-clause in the run_job() callbacks).

drm_sched_job_add_dependency() releases the dependent job as soon as the
fence signals, regardless of error, so any job submitted behind the
reset perfmon job is released and runs. The serialization window simply
closes when the job dies.

Best regards,
- Maíra


Iago

El dom, 31-05-2026 a las 17:18 -0300, Maíra Canal escribió:
A non-global perfmon is meant to count events generated by a specific
submission, but the scheduler can run jobs from different queues
concurrently on the same V3D core. Without explicit serialization, an
unrelated job running in parallel with a perfmon-carrying job
pollutes
the counters and generates unusable results.

To address such issue, we must enforce cross-queue serialization when
we
detect a perfmon-carrying submission. It's possible to implement
serialization by enforcing two rules:

   1. A job that carries a non-global perfmon must wait for every job
     currently in-flight across all HW queues to finish.

   2. While a perfmon-carrying job is still in-flight, all
subsequently
     submitted jobs must wait for it.

Note that serialization is not needed in the global perfmon case, as
the
global perfmon tracks activity from all jobs, so concurrency is
desirable.

Therefore, check if serialization is needed during job submission and
if
so, attach fence dependences to enforce cross-queue serialization.

Signed-off-by: Maíra Canal <[email protected]>
---
  drivers/gpu/drm/v3d/v3d_drv.h     | 35 ++++++++++++++++++----
  drivers/gpu/drm/v3d/v3d_gem.c     |  3 ++
  drivers/gpu/drm/v3d/v3d_perfmon.c |  6 ++++
  drivers/gpu/drm/v3d/v3d_submit.c  | 63
+++++++++++++++++++++++++++++++++++++++
  4 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.h
b/drivers/gpu/drm/v3d/v3d_drv.h
index 51486af68cf4..cdf4926d51f2 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -74,11 +74,13 @@ struct v3d_queue_state {
        spinlock_t queue_lock;
  };
-/* Performance monitor object. The perform lifetime is controlled by
userspace
- * using perfmon related ioctls. A perfmon can be attached to a
submit_cl
- * request, and when this is the case, HW perf counters will be
activated just
- * before the submit_cl is submitted to the GPU and disabled when
the job is
- * done. This way, only events related to a specific job will be
counted.
+/* Performance monitor object
+ *
+ * The performance monitor (perfmon) lifetime is controlled by
userspace using
+ * perfmon related ioctls. A perfmon can be attached to a CL or CSD
submission
+ * request, and when it is, HW performance counters will be
activated just
+ * before the job is submitted to the GPU and disabled when the job
is done.
+ * This way, only events related to a specific submission will be
counted.
   */
  struct v3d_perfmon {
        /* Tracks the number of users of the perfmon, when this
counter reaches
@@ -167,13 +169,31 @@ struct v3d_dev {
  struct v3d_queue_state queue[V3D_MAX_QUEUES]; - /* Tracks the performance monitor state. */
+       /*
+        * Tracks the performance monitor state and consistency.
+        *
+        * When a non-global perfmon is attached to a job, the
scheduler must
+        * not run any other job on the HW concurrently (otherwise,
the
+        * counters would be polluted by unrelated work).
+        */
        struct {
                /* Protects @active. */
                spinlock_t lock;
  /* Perfmon currently programmed in HW (or NULL if
none). */
                struct v3d_perfmon *active;
+
+               /* Finished fence of the most recently submitted job
that
+                * opened a serialization window (i.e. a job with a
non-global
+                * perfmon attached).
+                */
+               struct dma_fence *fence;
+
+               /* Finished fence of the most recently submitted job
on each HW
+                * queue. Used so that a new perfmon-carrying job
can depend on
+                * every job currently in-flight across all queues.
+                */
+               struct dma_fence *last_hw_fence[V3D_MAX_QUEUES];
        } perfmon_state;
  /* Protects bo_stats */
@@ -319,6 +339,9 @@ struct v3d_job {
  struct v3d_dev *v3d; + /* The queue that the job was submitted on. */
+       enum v3d_queue queue;
+
        /* This is the array of BOs that were looked up at the start
         * of submission.
         */
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c
b/drivers/gpu/drm/v3d/v3d_gem.c
index 9487ab7acd03..0b415486f5b4 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -363,8 +363,11 @@ v3d_gem_destroy(struct drm_device *dev)
        for (q = 0; q < V3D_MAX_QUEUES; q++) {
                WARN_ON(v3d->queue[q].active_job);
                v3d_stats_put(v3d->queue[q].stats);
+               dma_fence_put(v3d->perfmon_state.last_hw_fence[q]);
        }
+ dma_fence_put(v3d->perfmon_state.fence);
+
        drm_mm_takedown(&v3d->mm);
  dma_free_coherent(v3d->drm.dev, 4096 * 1024, (void *)v3d-
pt,
diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c
b/drivers/gpu/drm/v3d/v3d_perfmon.c
index 3ad0f022753c..07dab7fb3060 100644
--- a/drivers/gpu/drm/v3d/v3d_perfmon.c
+++ b/drivers/gpu/drm/v3d/v3d_perfmon.c
@@ -275,6 +275,12 @@ void v3d_perfmon_start(struct v3d_dev *v3d,
struct v3d_perfmon *perfmon)
        if (!perfmon || v3d->global_perfmon)
                return;
+ /* Cross-queue serialization should have drained any
previous perfmon
+        * job before this one runs.
+        */
+       if (WARN_ON_ONCE(v3d->perfmon_state.active))
+               return;
+
        if (!pm_runtime_get_if_active(v3d->drm.dev))
                return;
diff --git a/drivers/gpu/drm/v3d/v3d_submit.c
b/drivers/gpu/drm/v3d/v3d_submit.c
index 4c526aafc4e0..46fc88bacc95 100644
--- a/drivers/gpu/drm/v3d/v3d_submit.c
+++ b/drivers/gpu/drm/v3d/v3d_submit.c
@@ -234,6 +234,7 @@ v3d_submit_add_job(struct v3d_submit *submit,
void **container, size_t size,
        job->v3d = v3d;
        job->free = free;
        job->file_priv = v3d_priv;
+       job->queue = queue;
  ret = drm_sched_job_init(&job->base, &v3d_priv-
sched_entity[queue],
                                 1, v3d_priv, submit->file_priv-
client_id);
@@ -293,6 +294,62 @@ v3d_attach_perfmon_to_jobs(struct v3d_submit
*submit, u32 perfmon_id)
        return 0;
  }
+/*
+ * Prepare fences to enforce job serialization when a perfmon is
active. A job
+ * that carries a non-global perfmon must wait for every job
currently in-flight
+ * across all HW queues to finish, otherwise concurrent unrelated
work on the
+ * same core would pollute the performance counters. Symmetrically,
while such a
+ * job is still in-flight, all subsequently submitted jobs must wait
for it.
+ *
+ * We don't serialize the jobs when using a global perfmon as it's
expected to
+ * track concurrent activity from all jobs.
+ */
+static int
+v3d_serialize_for_perfmon(struct v3d_job *job)
+{
+       struct v3d_dev *v3d = job->v3d;
+       bool is_global_perfmon;
+       int ret;
+
+       lockdep_assert_held(&v3d->sched_lock);
+
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock)
+               is_global_perfmon = !!v3d->global_perfmon;
+
+       if (is_global_perfmon)
+               goto publish;
+
+       if (job->perfmon) {
+               for (enum v3d_queue q = 0; q < V3D_MAX_QUEUES; q++)
{
+                       struct dma_fence *f = v3d-
perfmon_state.last_hw_fence[q];
+
+                       if (!f || dma_fence_is_signaled(f))
+                               continue;
+
+                       ret = drm_sched_job_add_dependency(&job-
base, dma_fence_get(f));
+                       if (ret)
+                               return ret;
+               }
+       } else if (v3d->perfmon_state.fence &&
+                  !dma_fence_is_signaled(v3d->perfmon_state.fence))
{
+               ret = drm_sched_job_add_dependency(&job->base,
+                                               
dma_fence_get(v3d->perfmon_state.fence));
+               if (ret)
+                       return ret;
+       }
+
+publish:
+       dma_fence_put(v3d->perfmon_state.last_hw_fence[job->queue]);
+       v3d->perfmon_state.last_hw_fence[job->queue] =
dma_fence_get(job->done_fence);
+
+       if (job->perfmon && !is_global_perfmon) {
+               dma_fence_put(v3d->perfmon_state.fence);
+               v3d->perfmon_state.fence = dma_fence_get(job-
done_fence);
+       }
+
+       return 0;
+}
+
  static void
  v3d_submit_attach_object_fences(struct v3d_submit *submit)
  {
@@ -337,6 +394,12 @@ v3d_submit_jobs(struct v3d_submit *submit)
                        goto err;
        }
+ for (int i = 0; i < submit->job_count; i++) {
+               ret = v3d_serialize_for_perfmon(submit->jobs[i]);
+               if (ret)
+                       goto err;
+       }
+
        for (int i = 0; i < submit->job_count; i++)
                drm_sched_entity_push_job(&submit->jobs[i]->base);


Reply via email to