A non-global perfmon is meant to count events generated by a specific
submission, but the scheduler can run jobs from different queues
concurrently on the same V3D core. Without explicit serialization, an
unrelated job running in parallel with a perfmon-carrying job pollutes
the counters and generates unusable results.
To address this issue, we must enforce cross-queue serialization when we
detect a perfmon-carrying submission. It's possible to implement
serialization by enforcing two rules:
1. A job that carries a non-global perfmon must wait for every job
currently in-flight across all HW queues to finish.
2. While a perfmon-carrying job is still in-flight, all subsequently
submitted jobs must wait for it.
Note that serialization is not needed in the global perfmon case, as the
global perfmon tracks activity from all jobs, so concurrency is desirable.
Therefore, check if serialization is needed during job submission and if
so, attach fence dependencies to enforce cross-queue serialization.
Signed-off-by: Maíra Canal <[email protected]>
---
drivers/gpu/drm/v3d/v3d_drv.h | 32 ++++++++++++---
drivers/gpu/drm/v3d/v3d_gem.c | 3 ++
drivers/gpu/drm/v3d/v3d_perfmon.c | 3 ++
drivers/gpu/drm/v3d/v3d_submit.c | 83 +++++++++++++++++++++++++++++++++++----
4 files changed, 107 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index 3a7801348697..cdf4926d51f2 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -74,11 +74,13 @@ struct v3d_queue_state {
spinlock_t queue_lock;
};
-/* Performance monitor object. The perform lifetime is controlled by userspace
- * using perfmon related ioctls. A perfmon can be attached to a submit_cl
- * request, and when this is the case, HW perf counters will be activated just
- * before the submit_cl is submitted to the GPU and disabled when the job is
- * done. This way, only events related to a specific job will be counted.
+/* Performance monitor object
+ *
+ * The performance monitor (perfmon) lifetime is controlled by userspace using
+ * perfmon related ioctls. A perfmon can be attached to a CL or CSD submission
+ * request, and when it is, HW performance counters will be activated just
+ * before the job is submitted to the GPU and disabled when the job is done.
+ * This way, only events related to a specific submission will be counted.
*/
struct v3d_perfmon {
/* Tracks the number of users of the perfmon, when this counter reaches
@@ -167,13 +169,31 @@ struct v3d_dev {
struct v3d_queue_state queue[V3D_MAX_QUEUES];
- /* Tracks the performance monitor state. */
+ /*
+ * Tracks the performance monitor state and consistency.
+ *
+ * When a non-global perfmon is attached to a job, the scheduler must
+ * not run any other job on the HW concurrently (otherwise, the
+ * counters would be polluted by unrelated work).
+ */
struct {
/* Protects @active. */
spinlock_t lock;
/* Perfmon currently programmed in HW (or NULL if none). */
struct v3d_perfmon *active;
+
+ /* Finished fence of the most recently submitted job that
+ * opened a serialization window (i.e. a job with a non-global
+ * perfmon attached).
+ */
+ struct dma_fence *fence;
+
+ /* Finished fence of the most recently submitted job on each HW
+ * queue. Used so that a new perfmon-carrying job can depend on
+ * every job currently in-flight across all queues.
+ */
+ struct dma_fence *last_hw_fence[V3D_MAX_QUEUES];
} perfmon_state;
/* Protects bo_stats */
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 6e387c41fbee..80fa7b3bde61 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -362,8 +362,11 @@ v3d_gem_destroy(struct drm_device *dev)
for (q = 0; q < V3D_MAX_QUEUES; q++) {
WARN_ON(v3d->queue[q].active_job);
v3d_stats_put(v3d->queue[q].stats);
+ dma_fence_put(v3d->perfmon_state.last_hw_fence[q]);
}
+ dma_fence_put(v3d->perfmon_state.fence);
+
drm_mm_takedown(&v3d->mm);
dma_free_coherent(v3d->drm.dev, 4096 * 1024, (void *)v3d->pt,
diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c b/drivers/gpu/drm/v3d/v3d_perfmon.c
index 4f1c59e282b5..006eff007aa3 100644
--- a/drivers/gpu/drm/v3d/v3d_perfmon.c
+++ b/drivers/gpu/drm/v3d/v3d_perfmon.c
@@ -273,6 +273,9 @@ void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
if (perfmon == v3d->perfmon_state.active)
return;
+ if (WARN_ON_ONCE(v3d->perfmon_state.active))
+ return;
+
if (!pm_runtime_get_if_active(v3d->drm.dev))
return;
diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c
index 1047aca58282..9ce86cb3edd8 100644
--- a/drivers/gpu/drm/v3d/v3d_submit.c
+++ b/drivers/gpu/drm/v3d/v3d_submit.c
@@ -271,11 +271,67 @@ v3d_attach_perfmon_to_jobs(struct v3d_submit *submit, u32 perfmon_id)
return 0;
}
+/*
+ * Add the fence dependencies that enforce cross-queue serialization around a
+ * non-global perfmon, so unrelated work running concurrently on the same core
+ * cannot pollute its counters. A job carrying such a perfmon waits for the
+ * last job published on every HW queue; any other job waits for the most
+ * recent perfmon-carrying job while it is in flight. Under a global perfmon
+ * no dependency is added, as it is expected to track activity from all jobs.
+ * Caller holds v3d->sched_lock. Returns 0 after publishing job->done_fence,
+ * or the drm_sched_job_add_dependency() error with nothing published.
+ */
+static int
+v3d_serialize_for_perfmon(struct v3d_job *job)
+{
+	struct v3d_dev *v3d = job->v3d;
+	bool is_global_perfmon;
+	int ret;
+
+	lockdep_assert_held(&v3d->sched_lock);
+
+	scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock)
+		is_global_perfmon = !!v3d->global_perfmon;
+
+	if (is_global_perfmon)
+		goto publish;
+
+	if (job->perfmon) {
+		for (enum v3d_queue q = 0; q < V3D_MAX_QUEUES; q++) {
+			struct dma_fence *f = v3d->perfmon_state.last_hw_fence[q];
+
+			if (!f || dma_fence_is_signaled(f))
+				continue;
+
+			ret = drm_sched_job_add_dependency(&job->base, dma_fence_get(f));
+			if (ret)
+				return ret;
+		}
+	} else if (v3d->perfmon_state.fence &&
+		   !dma_fence_is_signaled(v3d->perfmon_state.fence)) {
+		ret = drm_sched_job_add_dependency(&job->base,
+						   dma_fence_get(v3d->perfmon_state.fence));
+		if (ret)
+			return ret;
+	}
+
+publish:
+	dma_fence_put(v3d->perfmon_state.last_hw_fence[job->queue]);
+	v3d->perfmon_state.last_hw_fence[job->queue] = dma_fence_get(job->done_fence);
+
+	if (job->perfmon && !is_global_perfmon) {
+		dma_fence_put(v3d->perfmon_state.fence);
+		v3d->perfmon_state.fence = dma_fence_get(job->done_fence);
+	}
+
+	return 0;
+}
+
static int
v3d_submit_jobs(struct v3d_submit *submit)
{
struct v3d_dev *v3d = submit->v3d;
- int i, j, ret = 0;
+ int i, ret = 0;
guard(mutex)(&v3d->sched_lock);
@@ -292,15 +348,26 @@ v3d_submit_jobs(struct v3d_submit *submit)
for (i = 0; i + 1 < submit->job_count; i++) {
ret = drm_sched_job_add_dependency(&submit->jobs[i + 1]->base,
dma_fence_get(submit->jobs[i]->done_fence));
- if (ret) {
- /* Mark every armed job as failed so run_job() skips
execution */
- for (j = 0; j < submit->job_count; j++)
-
dma_fence_set_error(&submit->jobs[j]->base.s_fence->finished,
- ret);
- break;
- }
+ if (ret)
+ goto err;
}
+ for (i = 0; i < submit->job_count; i++) {
+ ret = v3d_serialize_for_perfmon(submit->jobs[i]);
+ if (ret)
+ goto err;
+ }
+
+ for (i = 0; i < submit->job_count; i++)
+ drm_sched_entity_push_job(&submit->jobs[i]->base);
+
+ return 0;
+
+err:
+ /* Mark every armed job as failed so run_job() skips execution */
+ for (i = 0; i < submit->job_count; i++)
+ dma_fence_set_error(&submit->jobs[i]->base.s_fence->finished,
ret);
+
for (i = 0; i < submit->job_count; i++)
drm_sched_entity_push_job(&submit->jobs[i]->base);
--
2.54.0