v3d exposes a single set of performance counters per core, so at any
moment at most one performance monitor can be programmed in HW. In
software, this singleton is represented by v3d_dev->active_perfmon, but
until now nothing actually serialized access to it: scheduler callbacks,
the GPU-reset path, and perfmon ioctls all read and wrote that field
lock-free.

The existence of the v3d_perfmon->lock mutex did not close the gap. It
serialized start/stop of *one* perfmon object against itself, but the
invariant that needs protection is device-wide: there can be at most one
active perfmon at any moment in HW. Two threads acting on different
perfmon objects could race through v3d_dev->active_perfmon and the
counter registers, leaving software and HW out of sync.

Move the locking to where the invariant actually lives: group the active
perfmon pointer with a device-wide spinlock and route every state
transition (job start, job completion, set_global, reset,
suspend/resume, destruction) through a small set of locked entry points
that are the only mutators of the HW counters.

Some design improvements needed to be made for the refactor:

  1. Stop the perfmon from the IRQ handler at job-completion time (the
     natural boundary for "active perfmon follows the active job").
     Since the IRQ handler cannot sleep, this required changing the lock
     from a mutex to a spinlock. This also solves another issue of the
     existing design: perfmon start/stop was exclusively attached to the
     run_job() callbacks, so if no further job was queued, the perfmon
     was never actually stopped.

  2. Pause/resume the HW counters across runtime-PM transitions without
     dropping the software reference. This preserves the perfmon state
     while the device is idle.

  3. Move the global perfmon lifecycle management to the set_global
     IOCTL. This simplifies the logic in v3d_perfmon_start() and
     v3d_perfmon_stop(), as there is no need to always check if the
     global perfmon is enabled.

  4. v3d_perfmon_get_values_ioctl() no longer stops the perfmon when
     capturing the values; it only reads the current counter values.
     All lifecycle management is handled by the job (for per-job
     perfmons) or the set_global IOCTL (for global perfmons).

Signed-off-by: Maíra Canal <[email protected]>
---
 drivers/gpu/drm/v3d/v3d_drv.h     |  17 ++--
 drivers/gpu/drm/v3d/v3d_gem.c     |   4 +-
 drivers/gpu/drm/v3d/v3d_irq.c     |   7 +-
 drivers/gpu/drm/v3d/v3d_perfmon.c | 183 +++++++++++++++++++++++++++-----------
 drivers/gpu/drm/v3d/v3d_power.c   |   4 +
 drivers/gpu/drm/v3d/v3d_sched.c   |  26 ++----
 drivers/gpu/drm/v3d/v3d_submit.c  |   6 +-
 7 files changed, 165 insertions(+), 82 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index 071d919fe860..51486af68cf4 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -86,9 +86,6 @@ struct v3d_perfmon {
         */
        refcount_t refcnt;
 
-       /* Protects perfmon stop, as it can be invoked from multiple places. */
-       struct mutex lock;
-
        /* Number of counters activated in this perfmon instance
         * (should be less than DRM_V3D_MAX_PERF_COUNTERS).
         */
@@ -170,8 +167,14 @@ struct v3d_dev {
 
        struct v3d_queue_state queue[V3D_MAX_QUEUES];
 
-       /* Used to track the active perfmon if any. */
-       struct v3d_perfmon *active_perfmon;
+       /* Tracks the performance monitor state. */
+       struct {
+               /* Protects @active. */
+               spinlock_t lock;
+
+               /* Perfmon currently programmed in HW (or NULL if none). */
+               struct v3d_perfmon *active;
+       } perfmon_state;
 
        /* Protects bo_stats */
        struct mutex bo_lock;
@@ -663,6 +666,10 @@ void v3d_perfmon_put(struct v3d_perfmon *perfmon);
 void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon);
 void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
                      bool capture);
+void v3d_perfmon_stop_locked(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
+                            bool capture);
+void v3d_perfmon_suspend(struct v3d_dev *v3d);
+void v3d_perfmon_resume(struct v3d_dev *v3d);
 struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id);
 void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv);
 void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv);
diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 1ee3c038d5f6..9487ab7acd03 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -137,7 +137,8 @@ v3d_reset(struct v3d_dev *v3d)
        v3d_mmu_set_page_table(v3d);
        v3d_irq_reset(v3d);
 
-       v3d_perfmon_stop(v3d, v3d->active_perfmon, false);
+       /* Re-arm the global perfmon HW counters that the reset zeroed. */
+       v3d_perfmon_resume(v3d);
 
        trace_v3d_reset_end(dev);
 }
@@ -299,6 +300,7 @@ v3d_gem_init(struct drm_device *dev)
        }
 
        spin_lock_init(&v3d->mm_lock);
+       spin_lock_init(&v3d->perfmon_state.lock);
        ret = drmm_mutex_init(dev, &v3d->bo_lock);
        if (ret)
                goto err_stats;
diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c
index 754a969b862b..41fce1f8f96c 100644
--- a/drivers/gpu/drm/v3d/v3d_irq.c
+++ b/drivers/gpu/drm/v3d/v3d_irq.c
@@ -87,9 +87,12 @@ v3d_irq_signal_fence(struct v3d_dev *v3d, enum v3d_queue q,
                     void (*trace_irq)(struct drm_device *, uint64_t))
 {
        struct v3d_queue_state *queue = &v3d->queue[q];
-       struct v3d_fence *fence = to_v3d_fence(queue->active_job->irq_fence);
+       struct v3d_job *job = queue->active_job;
+       struct v3d_fence *fence = to_v3d_fence(job->irq_fence);
 
-       v3d_job_update_stats(queue->active_job);
+       v3d_perfmon_stop(v3d, job->perfmon, true);
+
+       v3d_job_update_stats(job);
        trace_irq(&v3d->drm, fence->seqno);
 
        queue->active_job = NULL;
diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c b/drivers/gpu/drm/v3d/v3d_perfmon.c
index 48ae748247be..3ad0f022753c 100644
--- a/drivers/gpu/drm/v3d/v3d_perfmon.c
+++ b/drivers/gpu/drm/v3d/v3d_perfmon.c
@@ -217,26 +217,15 @@ void v3d_perfmon_get(struct v3d_perfmon *perfmon)
 
 void v3d_perfmon_put(struct v3d_perfmon *perfmon)
 {
-       if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
-               mutex_destroy(&perfmon->lock);
+       if (perfmon && refcount_dec_and_test(&perfmon->refcnt))
                kfree(perfmon);
-       }
 }
 
-void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
+static void v3d_perfmon_hw_start(struct v3d_dev *v3d, struct v3d_perfmon 
*perfmon)
 {
+       u8 ncounters = perfmon->ncounters;
+       u32 mask = GENMASK(ncounters - 1, 0);
        unsigned int i;
-       u32 mask;
-       u8 ncounters;
-
-       if (WARN_ON_ONCE(!perfmon || v3d->active_perfmon))
-               return;
-
-       if (!pm_runtime_get_if_active(v3d->drm.dev))
-               return;
-
-       ncounters = perfmon->ncounters;
-       mask = GENMASK(ncounters - 1, 0);
 
        for (i = 0; i < ncounters; i++) {
                u32 source = i / 4;
@@ -258,39 +247,106 @@ void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
        V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, mask);
        V3D_CORE_WRITE(0, V3D_V4_PCTR_0_CLR, mask);
        V3D_CORE_WRITE(0, V3D_PCTR_0_OVERFLOW, mask);
+}
 
-       v3d->active_perfmon = perfmon;
+static void v3d_perfmon_hw_capture(struct v3d_dev *v3d, struct v3d_perfmon 
*perfmon)
+{
+       u32 mask = GENMASK(perfmon->ncounters - 1, 0);
 
+       for (int i = 0; i < perfmon->ncounters; i++)
+               perfmon->values[i] += V3D_CORE_READ(0, V3D_PCTR_0_PCTRX(i));
+
+       V3D_CORE_WRITE(0, V3D_V4_PCTR_0_CLR, mask);
+}
+
+static void v3d_perfmon_hw_stop(struct v3d_dev *v3d, struct v3d_perfmon 
*perfmon,
+                               bool capture)
+{
+       if (capture)
+               v3d_perfmon_hw_capture(v3d, perfmon);
+
+       V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0);
+}
+
+void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
+{
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
+
+       if (!perfmon || v3d->global_perfmon)
+               return;
+
+       if (!pm_runtime_get_if_active(v3d->drm.dev))
+               return;
+
+       v3d_perfmon_hw_start(v3d, perfmon);
+       v3d->perfmon_state.active = perfmon;
+
+       v3d_pm_runtime_put(v3d);
+}
+
+static void v3d_perfmon_capture_locked(struct v3d_dev *v3d,
+                                      struct v3d_perfmon *perfmon)
+{
+       lockdep_assert_held(&v3d->perfmon_state.lock);
+
+       if (!perfmon || perfmon != v3d->perfmon_state.active)
+               return;
+
+       if (!pm_runtime_get_if_active(v3d->drm.dev))
+               return;
+
+       v3d_perfmon_hw_capture(v3d, perfmon);
+       v3d_pm_runtime_put(v3d);
+}
+
+void v3d_perfmon_stop_locked(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
+                            bool capture)
+{
+       lockdep_assert_held(&v3d->perfmon_state.lock);
+
+       if (!perfmon || perfmon != v3d->perfmon_state.active)
+               return;
+
+       v3d->perfmon_state.active = NULL;
+
+       /* If the device is suspended, the HW has already stopped counting. */
+       if (!pm_runtime_get_if_active(v3d->drm.dev))
+               return;
+
+       v3d_perfmon_hw_stop(v3d, perfmon, capture);
        v3d_pm_runtime_put(v3d);
 }
 
 void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
                      bool capture)
 {
-       unsigned int i;
-
-       if (!perfmon || !v3d->active_perfmon)
+       if (!perfmon)
                return;
 
-       mutex_lock(&perfmon->lock);
-       if (perfmon != v3d->active_perfmon)
-               goto out;
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
+       v3d_perfmon_stop_locked(v3d, perfmon, capture);
+}
 
-       if (!pm_runtime_get_if_active(v3d->drm.dev))
-               goto out_clear;
+void
+v3d_perfmon_suspend(struct v3d_dev *v3d)
+{
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
 
-       if (capture)
-               for (i = 0; i < perfmon->ncounters; i++)
-                       perfmon->values[i] += V3D_CORE_READ(0, 
V3D_PCTR_0_PCTRX(i));
+       if (!v3d->perfmon_state.active)
+               return;
 
-       V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0);
+       v3d_perfmon_hw_stop(v3d, v3d->perfmon_state.active, true);
+}
 
-       v3d_pm_runtime_put(v3d);
+void
+v3d_perfmon_resume(struct v3d_dev *v3d)
+{
+       guard(spinlock_irqsave)(&v3d->perfmon_state.lock);
 
-out_clear:
-       v3d->active_perfmon = NULL;
-out:
-       mutex_unlock(&perfmon->lock);
+       if (!v3d->perfmon_state.active)
+               return;
+
+       v3d_perfmon_hw_start(v3d, v3d->perfmon_state.active);
 }
 
 struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id)
@@ -316,14 +372,17 @@ static void v3d_perfmon_delete(struct v3d_file_priv *v3d_priv,
        struct v3d_dev *v3d = v3d_priv->v3d;
 
        /* If the active perfmon is being destroyed, stop it first */
-       if (perfmon == v3d->active_perfmon)
-               v3d_perfmon_stop(v3d, perfmon, false);
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               v3d_perfmon_stop_locked(v3d, perfmon, false);
 
-       /* If the global perfmon is being destroyed, clean it and release
-        * the reference stashed in v3d_perfmon_set_global_ioctl().
-        */
-       if (cmpxchg(&v3d->global_perfmon, perfmon, NULL) == perfmon)
-               v3d_perfmon_put(perfmon);
+               /* If the global perfmon is being destroyed, clean it and 
release
+                * the reference stashed in v3d_perfmon_set_global_ioctl().
+                */
+               if (v3d->global_perfmon == perfmon) {
+                       v3d_perfmon_put(v3d->global_perfmon);
+                       v3d->global_perfmon = NULL;
+               }
+       }
 
        v3d_perfmon_put(perfmon);
 }
@@ -371,12 +430,10 @@ int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data,
        perfmon->ncounters = req->ncounters;
 
        refcount_set(&perfmon->refcnt, 1);
-       mutex_init(&perfmon->lock);
 
        ret = xa_alloc(&v3d_priv->perfmons, &id, perfmon, xa_limit_32b,
                       GFP_KERNEL);
        if (ret < 0) {
-               mutex_destroy(&perfmon->lock);
                kfree(perfmon);
                return ret;
        }
@@ -408,7 +465,9 @@ int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
        struct v3d_dev *v3d = to_v3d_dev(dev);
        struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
        struct drm_v3d_perfmon_get_values *req = data;
+       u64 values[DRM_V3D_MAX_PERF_COUNTERS];
        struct v3d_perfmon *perfmon;
+       size_t size;
        int ret = 0;
 
        if (req->pad != 0)
@@ -418,10 +477,14 @@ int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
        if (!perfmon)
                return -EINVAL;
 
-       v3d_perfmon_stop(v3d, perfmon, true);
+       size = perfmon->ncounters * sizeof(u64);
 
-       if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->values,
-                        perfmon->ncounters * sizeof(u64)))
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               v3d_perfmon_capture_locked(v3d, perfmon);
+               memcpy(values, perfmon->values, size);
+       }
+
+       if (copy_to_user(u64_to_user_ptr(req->values_ptr), values, size))
                ret = -EFAULT;
 
        v3d_perfmon_put(perfmon);
@@ -482,18 +545,36 @@ int v3d_perfmon_set_global_ioctl(struct drm_device *dev, void *data,
                 */
                v3d_perfmon_put(perfmon);
 
-               old = xchg(&v3d->global_perfmon, NULL);
-               if (!old)
-                       return -EINVAL;
+               scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+                       old = v3d->global_perfmon;
+                       if (!old)
+                               return -EINVAL;
+
+                       v3d_perfmon_stop_locked(v3d, old, true);
+                       v3d->global_perfmon = NULL;
+               }
 
                v3d_perfmon_put(old);
 
                return 0;
        }
 
-       if (cmpxchg(&v3d->global_perfmon, NULL, perfmon)) {
-               v3d_perfmon_put(perfmon);
-               return -EBUSY;
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               if (v3d->perfmon_state.active || v3d->global_perfmon) {
+                       v3d_perfmon_put(perfmon);
+                       return -EBUSY;
+               }
+
+               v3d->global_perfmon = perfmon;
+               v3d->perfmon_state.active = perfmon;
+
+               /* If the device is suspended, v3d_perfmon_resume() will
+                * program the HW on the next resume.
+                */
+               if (pm_runtime_get_if_active(v3d->drm.dev)) {
+                       v3d_perfmon_hw_start(v3d, perfmon);
+                       v3d_pm_runtime_put(v3d);
+               }
        }
 
        return 0;
diff --git a/drivers/gpu/drm/v3d/v3d_power.c b/drivers/gpu/drm/v3d/v3d_power.c
index 769e90032b04..1a4b651a2c5f 100644
--- a/drivers/gpu/drm/v3d/v3d_power.c
+++ b/drivers/gpu/drm/v3d/v3d_power.c
@@ -50,6 +50,8 @@ int v3d_power_suspend(struct device *dev)
        struct v3d_dev *v3d = to_v3d_dev(drm);
        int ret;
 
+       v3d_perfmon_suspend(v3d);
+
        v3d_irq_disable(v3d);
 
        ret = v3d_suspend_sms(v3d);
@@ -83,5 +85,7 @@ int v3d_power_resume(struct device *dev)
        v3d_mmu_set_page_table(v3d);
        v3d_irq_enable(v3d);
 
+       v3d_perfmon_resume(v3d);
+
        return 0;
 }
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index c16a9d4d41e6..4d2b91d49542 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -125,24 +125,6 @@ v3d_performance_query_info_free(struct v3d_performance_query_info *query_info,
        }
 }
 
-static void
-v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
-{
-       struct v3d_perfmon *perfmon = v3d->global_perfmon;
-
-       if (!perfmon)
-               perfmon = job->perfmon;
-
-       if (perfmon == v3d->active_perfmon)
-               return;
-
-       if (perfmon != v3d->active_perfmon)
-               v3d_perfmon_stop(v3d, v3d->active_perfmon, true);
-
-       if (perfmon && v3d->active_perfmon != perfmon)
-               v3d_perfmon_start(v3d, perfmon);
-}
-
 static void
 v3d_stats_start(struct v3d_stats *stats, u64 now)
 {
@@ -219,7 +201,7 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
                            job->start, job->end);
 
        v3d_job_start_stats(&job->base);
-       v3d_switch_perfmon(v3d, &job->base);
+       v3d_perfmon_start(v3d, job->base.perfmon);
 
        /* Set the current and end address of the control list.
         * Writing the end register is what starts the job.
@@ -277,7 +259,7 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
                            job->start, job->end);
 
        v3d_job_start_stats(&job->base);
-       v3d_switch_perfmon(v3d, &job->base);
+       v3d_perfmon_start(v3d, job->base.perfmon);
 
        /* XXX: Set the QCFG */
 
@@ -370,7 +352,7 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)
        trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
 
        v3d_job_start_stats(&job->base);
-       v3d_switch_perfmon(v3d, &job->base);
+       v3d_perfmon_start(v3d, job->base.perfmon);
 
        csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver);
        for (i = 1; i <= 6; i++)
@@ -711,6 +693,8 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job,
        if (sched_job)
                drm_sched_increase_karma(sched_job);
 
+       v3d_perfmon_stop(v3d, job->perfmon, false);
+
        /* get the GPU back into the init state */
        v3d_reset(v3d);
 
diff --git a/drivers/gpu/drm/v3d/v3d_submit.c b/drivers/gpu/drm/v3d/v3d_submit.c
index 02441d4f495d..4c526aafc4e0 100644
--- a/drivers/gpu/drm/v3d/v3d_submit.c
+++ b/drivers/gpu/drm/v3d/v3d_submit.c
@@ -275,8 +275,10 @@ v3d_attach_perfmon_to_jobs(struct v3d_submit *submit, u32 perfmon_id)
        if (!perfmon_id)
                return 0;
 
-       if (v3d->global_perfmon)
-               return -EAGAIN;
+       scoped_guard(spinlock_irqsave, &v3d->perfmon_state.lock) {
+               if (v3d->global_perfmon)
+                       return -EAGAIN;
+       }
 
        perfmon = v3d_perfmon_find(v3d_priv, perfmon_id);
        if (!perfmon)

-- 
2.54.0

Reply via email to