v3d exposes a single set of performance counters per core, so at any
moment at most one performance monitor can be programmed in HW. In
software, this singleton is represented by v3d_dev->active_perfmon, but
until now nothing actually serialized access to it: scheduler callbacks,
the GPU-reset path, and perfmon ioctls all read and wrote that field
lock-free.

The existence of v3d_perfmon->lock mutex did not close the gap. It
serialized start/stop of *one* perfmon object against itself, but the
invariant that needs protection is device-wide: there can be exactly one
active perfmon at any moment in HW. Two threads acting on different
perfmon objects could race through v3d_dev->active_perfmon and the
counter registers, leaving software and HW out of sync.

This commit moves the locking to where the invariant actually lives. Group
the active perfmon pointer with a device-wide spinlock and route every
state transition (job start, job completion, set global, reset,
suspend/resume, destruction) through a small set of locked entry points
that are the only mutators of the HW counters.

Using a spinlock also unlocks two design improvements: stopping the
perfmon from the IRQ handler at job-completion time (the natural boundary
for "active perfmon follows the active job"), and pausing/resuming the HW
counters across runtime-PM transitions without dropping the software
reference. This solves yet another issue of the current design: perfmon
start/stop was exclusively attached to run_job() callbacks, which means
that if nothing was further queued up, a perfmon would never actually
stopped.

Signed-off-by: Maíra Canal <[email protected]>
---
 drivers/gpu/drm/v3d/v3d_drv.h     |  20 ++++--
 drivers/gpu/drm/v3d/v3d_gem.c     |   3 +-
 drivers/gpu/drm/v3d/v3d_irq.c     |   6 ++
 drivers/gpu/drm/v3d/v3d_perfmon.c | 145 ++++++++++++++++++++++++++------------
 drivers/gpu/drm/v3d/v3d_power.c   |   4 ++
 drivers/gpu/drm/v3d/v3d_sched.c   |  24 +------
 drivers/gpu/drm/v3d/v3d_submit.c  |   7 +-
 7 files changed, 136 insertions(+), 73 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index 071d919fe860..3a7801348697 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -86,9 +86,6 @@ struct v3d_perfmon {
         */
        refcount_t refcnt;
 
-       /* Protects perfmon stop, as it can be invoked from multiple places. */
-       struct mutex lock;
-
        /* Number of counters activated in this perfmon instance
         * (should be less than DRM_V3D_MAX_PERF_COUNTERS).
         */
@@ -170,8 +167,14 @@ struct v3d_dev {
 
        struct v3d_queue_state queue[V3D_MAX_QUEUES];
 
-       /* Used to track the active perfmon if any. */
-       struct v3d_perfmon *active_perfmon;
+       /* Tracks the performance monitor state. */
+       struct {
+               /* Protects @active. */
+               spinlock_t lock;
+
+               /* Perfmon currently programmed in HW (or NULL if none). */
+               struct v3d_perfmon *active;
+       } perfmon_state;
 
        /* Protects bo_stats */
        struct mutex bo_lock;
@@ -316,6 +319,9 @@ struct v3d_job {
 
        struct v3d_dev *v3d;
 
+       /* The queue that the job was submitted on. */
+       enum v3d_queue queue;
+
        /* This is the array of BOs that were looked up at the start
         * of submission.
         */
@@ -663,6 +669,10 @@ void v3d_perfmon_put(struct v3d_perfmon *perfmon);
 void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon);
 void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
                      bool capture);
+void v3d_perfmon_stop_locked(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
+                            bool capture);
+void v3d_perfmon_suspend(struct v3d_dev *v3d);
+void v3d_perfmon_resume(struct v3d_dev *v3d);
 struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id);
 void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv);
 void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv);
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 1ee3c038d5f6..6e387c41fbee 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -137,7 +137,7 @@ v3d_reset(struct v3d_dev *v3d)
        v3d_mmu_set_page_table(v3d);
        v3d_irq_reset(v3d);
 
-       v3d_perfmon_stop(v3d, v3d->active_perfmon, false);
+       v3d_perfmon_stop(v3d, v3d->perfmon_state.active, false);
 
        trace_v3d_reset_end(dev);
 }
@@ -299,6 +299,7 @@ v3d_gem_init(struct drm_device *dev)
        }
 
        spin_lock_init(&v3d->mm_lock);
+       spin_lock_init(&v3d->perfmon_state.lock);
        ret = drmm_mutex_init(dev, &v3d->bo_lock);
        if (ret)
                goto err_stats;
diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c
index d4d7c39227be..9b446672c424 100644
--- a/drivers/gpu/drm/v3d/v3d_irq.c
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
@@ -92,6 +92,12 @@ v3d_irq_signal_fence(struct v3d_dev *v3d, enum v3d_queue q,
        struct v3d_fence *fence = to_v3d_fence(queue->active_job->irq_fence);
 
        v3d_job_update_stats(queue->active_job);
+
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               if (v3d->perfmon_state.active != v3d->global_perfmon)
+                       v3d_perfmon_stop_locked(v3d, v3d->perfmon_state.active, 
true);
+       }
+
        trace_irq(&v3d->drm, fence->seqno);
 
        queue->active_job = NULL;
diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c 
b/drivers/gpu/drm/v3d/v3d_perfmon.c
index 02451fc09dbb..4f1c59e282b5 100644
--- a/drivers/gpu/drm/v3d/v3d_perfmon.c
+++ b/drivers/gpu/drm/v3d/v3d_perfmon.c
@@ -217,26 +217,15 @@ void v3d_perfmon_get(struct v3d_perfmon *perfmon)
 
 void v3d_perfmon_put(struct v3d_perfmon *perfmon)
 {
-       if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
-               mutex_destroy(&perfmon->lock);
+       if (perfmon && refcount_dec_and_test(&perfmon->refcnt))
                kfree(perfmon);
-       }
 }
 
-void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
+static void v3d_perfmon_hw_start(struct v3d_dev *v3d, struct v3d_perfmon 
*perfmon)
 {
+       u8 ncounters = perfmon->ncounters;
+       u32 mask = GENMASK(ncounters - 1, 0);
        unsigned int i;
-       u32 mask;
-       u8 ncounters;
-
-       if (WARN_ON_ONCE(!perfmon || v3d->active_perfmon))
-               return;
-
-       if (!pm_runtime_get_if_active(v3d->drm.dev))
-               return;
-
-       ncounters = perfmon->ncounters;
-       mask = GENMASK(ncounters - 1, 0);
 
        for (i = 0; i < ncounters; i++) {
                u32 source = i / 4;
@@ -258,8 +247,56 @@ void v3d_perfmon_start(struct v3d_dev *v3d, struct 
v3d_perfmon *perfmon)
        V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, mask);
        V3D_CORE_WRITE(0, V3D_V4_PCTR_0_CLR, mask);
        V3D_CORE_WRITE(0, V3D_PCTR_0_OVERFLOW, mask);
+}
 
-       v3d->active_perfmon = perfmon;
+static void v3d_perfmon_hw_stop(struct v3d_dev *v3d, struct v3d_perfmon 
*perfmon,
+                               bool capture)
+{
+       unsigned int i;
+
+       if (capture)
+               for (i = 0; i < perfmon->ncounters; i++)
+                       perfmon->values[i] += V3D_CORE_READ(0, 
V3D_PCTR_0_PCTRX(i));
+
+       V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0);
+}
+
+void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
+{
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
+
+       perfmon = v3d->global_perfmon ?: perfmon;
+
+       if (!perfmon)
+               return;
+
+       if (perfmon == v3d->perfmon_state.active)
+               return;
+
+       if (!pm_runtime_get_if_active(v3d->drm.dev))
+               return;
+
+       v3d_perfmon_hw_start(v3d, perfmon);
+       v3d->perfmon_state.active = perfmon;
+
+       v3d_pm_runtime_put(v3d);
+}
+
+void v3d_perfmon_stop_locked(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
+                            bool capture)
+{
+       lockdep_assert_held(&v3d->perfmon_state.lock);
+
+       if (!perfmon || perfmon != v3d->perfmon_state.active)
+               return;
+
+       if (!pm_runtime_get_if_active(v3d->drm.dev)) {
+               v3d->perfmon_state.active = NULL;
+               return;
+       }
+
+       v3d_perfmon_hw_stop(v3d, perfmon, capture);
+       v3d->perfmon_state.active = NULL;
 
        v3d_pm_runtime_put(v3d);
 }
@@ -267,30 +304,31 @@ void v3d_perfmon_start(struct v3d_dev *v3d, struct 
v3d_perfmon *perfmon)
 void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
                      bool capture)
 {
-       unsigned int i;
-
-       if (!perfmon || !v3d->active_perfmon)
+       if (!perfmon)
                return;
 
-       mutex_lock(&perfmon->lock);
-       if (perfmon != v3d->active_perfmon)
-               goto out;
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
+       v3d_perfmon_stop_locked(v3d, perfmon, capture);
+}
 
-       if (!pm_runtime_get_if_active(v3d->drm.dev))
-               goto out_clear;
+void v3d_perfmon_suspend(struct v3d_dev *v3d)
+{
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
 
-       if (capture)
-               for (i = 0; i < perfmon->ncounters; i++)
-                       perfmon->values[i] += V3D_CORE_READ(0, 
V3D_PCTR_0_PCTRX(i));
+       if (!v3d->perfmon_state.active)
+               return;
 
-       V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0);
+       v3d_perfmon_hw_stop(v3d, v3d->perfmon_state.active, true);
+}
 
-       v3d_pm_runtime_put(v3d);
+void v3d_perfmon_resume(struct v3d_dev *v3d)
+{
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
 
-out_clear:
-       v3d->active_perfmon = NULL;
-out:
-       mutex_unlock(&perfmon->lock);
+       if (!v3d->perfmon_state.active)
+               return;
+
+       v3d_perfmon_hw_start(v3d, v3d->perfmon_state.active);
 }
 
 struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id)
@@ -316,11 +354,15 @@ static void v3d_perfmon_delete(struct v3d_file_priv 
*v3d_priv,
        struct v3d_dev *v3d = v3d_priv->v3d;
 
        /* If the active perfmon is being destroyed, stop it first */
-       if (perfmon == v3d->active_perfmon)
-               v3d_perfmon_stop(v3d, perfmon, false);
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               /* If the global perfmon is being destroyed, set it to NULL */
+               if (v3d->global_perfmon == perfmon) {
+                       v3d->global_perfmon = NULL;
+                       v3d_perfmon_put(perfmon);
+               }
 
-       /* If the global perfmon is being destroyed, set it to NULL */
-       cmpxchg(&v3d->global_perfmon, perfmon, NULL);
+               v3d_perfmon_stop_locked(v3d, perfmon, false);
+       }
 
        v3d_perfmon_put(perfmon);
 }
@@ -368,12 +410,10 @@ int v3d_perfmon_create_ioctl(struct drm_device *dev, void 
*data,
        perfmon->ncounters = req->ncounters;
 
        refcount_set(&perfmon->refcnt, 1);
-       mutex_init(&perfmon->lock);
 
        ret = xa_alloc(&v3d_priv->perfmons, &id, perfmon, xa_limit_32b,
                       GFP_KERNEL);
        if (ret < 0) {
-               mutex_destroy(&perfmon->lock);
                kfree(perfmon);
                return ret;
        }
@@ -471,16 +511,33 @@ int v3d_perfmon_set_global_ioctl(struct drm_device *dev, 
void *data,
 
        /* If the request is to clear the global performance monitor */
        if (req->flags & DRM_V3D_PERFMON_CLEAR_GLOBAL) {
-               if (!v3d->global_perfmon)
-                       return -EINVAL;
+               struct v3d_perfmon *old;
 
-               xchg(&v3d->global_perfmon, NULL);
+               scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+                       old = v3d->global_perfmon;
+                       if (!old) {
+                               v3d_perfmon_put(perfmon);
+                               return -EINVAL;
+                       }
+
+                       v3d->global_perfmon = NULL;
+                       v3d_perfmon_stop_locked(v3d, old, true);
+               }
+
+               v3d_perfmon_put(old);
+               v3d_perfmon_put(perfmon);
 
                return 0;
        }
 
-       if (cmpxchg(&v3d->global_perfmon, NULL, perfmon))
-               return -EBUSY;
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               if (v3d->perfmon_state.active || v3d->global_perfmon) {
+                       v3d_perfmon_put(perfmon);
+                       return -EBUSY;
+               }
+
+               v3d->global_perfmon = perfmon;
+       }
 
        return 0;
 }
diff --git a/drivers/gpu/drm/v3d/v3d_power.c b/drivers/gpu/drm/v3d/v3d_power.c
index 769e90032b04..1a4b651a2c5f 100644
--- a/drivers/gpu/drm/v3d/v3d_power.c
+++ b/drivers/gpu/drm/v3d/v3d_power.c
@@ -50,6 +50,8 @@ int v3d_power_suspend(struct device *dev)
        struct v3d_dev *v3d = to_v3d_dev(drm);
        int ret;
 
+       v3d_perfmon_suspend(v3d);
+
        v3d_irq_disable(v3d);
 
        ret = v3d_suspend_sms(v3d);
@@ -83,5 +85,7 @@ int v3d_power_resume(struct device *dev)
        v3d_mmu_set_page_table(v3d);
        v3d_irq_enable(v3d);
 
+       v3d_perfmon_resume(v3d);
+
        return 0;
 }
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 96d476b20abd..ae7f08e60b9e 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -139,24 +139,6 @@ v3d_cpu_job_free(struct drm_sched_job *sched_job)
        v3d_job_cleanup(&job->base);
 }
 
-static void
-v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
-{
-       struct v3d_perfmon *perfmon = v3d->global_perfmon;
-
-       if (!perfmon)
-               perfmon = job->perfmon;
-
-       if (perfmon == v3d->active_perfmon)
-               return;
-
-       if (perfmon != v3d->active_perfmon)
-               v3d_perfmon_stop(v3d, v3d->active_perfmon, true);
-
-       if (perfmon && v3d->active_perfmon != perfmon)
-               v3d_perfmon_start(v3d, perfmon);
-}
-
 static void
 v3d_stats_start(struct v3d_stats *stats, u64 now)
 {
@@ -233,7 +215,7 @@ static struct dma_fence *v3d_bin_job_run(struct 
drm_sched_job *sched_job)
                            job->start, job->end);
 
        v3d_job_start_stats(&job->base);
-       v3d_switch_perfmon(v3d, &job->base);
+       v3d_perfmon_start(v3d, job->base.perfmon);
 
        /* Set the current and end address of the control list.
         * Writing the end register is what starts the job.
@@ -291,7 +273,7 @@ static struct dma_fence *v3d_render_job_run(struct 
drm_sched_job *sched_job)
                            job->start, job->end);
 
        v3d_job_start_stats(&job->base);
-       v3d_switch_perfmon(v3d, &job->base);
+       v3d_perfmon_start(v3d, job->base.perfmon);
 
        /* XXX: Set the QCFG */
 
@@ -384,7 +366,7 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)
        trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
 
        v3d_job_start_stats(&job->base);
-       v3d_switch_perfmon(v3d, &job->base);
+       v3d_perfmon_start(v3d, job->base.perfmon);
 
        csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver);
        for (i = 1; i <= 6; i++)
diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c
index d1ce1a7dbcf4..1047aca58282 100644
--- a/drivers/gpu/drm/v3d/v3d_submit.c
+++ b/drivers/gpu/drm/v3d/v3d_submit.c
@@ -210,6 +210,7 @@ v3d_submit_add_job(struct v3d_submit *submit, void 
**container, size_t size,
        job->v3d = v3d;
        job->free = free;
        job->file_priv = v3d_priv;
+       job->queue = queue;
 
        ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
                                 1, v3d_priv, submit->file_priv->client_id);
@@ -251,8 +252,10 @@ v3d_attach_perfmon_to_jobs(struct v3d_submit *submit, u32 
perfmon_id)
        if (!perfmon_id)
                return 0;
 
-       if (v3d->global_perfmon)
-               return -EAGAIN;
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               if (v3d->global_perfmon)
+                       return -EAGAIN;
+       }
 
        perfmon = v3d_perfmon_find(v3d_priv, perfmon_id);
        if (!perfmon)

-- 
2.54.0

Reply via email to