Right now, if there's a HW reset and an HWPerf session is active,
panfrost_mmu_reset() will reset the AS count for every single open file's
mmu struct back to 0, and also invalidate their AS numbers. Then, when
disabling hwperf, panfrost_mmu_as_put() will WARN that mmu->as_count is
less than zero.

Fix this by introducing a perfcnt HW reset path.

The choice was made to render perfcnt unusable after reset, so that a
user might have to reprogram it with a full disable/enable sequence
before requesting more perfcnt dumps.

Reported-by: Claude <[email protected]>
Closes: https://gitlab.freedesktop.org/panfrost/linux/-/work_items/88
Signed-off-by: Adrián Larumbe <[email protected]>
Fixes: 7786fd108777 ("drm/panfrost: Expose performance counters through 
unstable ioctls")
---
 drivers/gpu/drm/panfrost/panfrost_device.c  |  1 +
 drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 46 ++++++++++++++++++++++++++++-
 drivers/gpu/drm/panfrost/panfrost_perfcnt.h |  1 +
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c 
b/drivers/gpu/drm/panfrost/panfrost_device.c
index 87b372c9e675..2805d50c1b9b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -426,6 +426,7 @@ bool panfrost_exception_needs_reset(const struct 
panfrost_device *pfdev,
 
 void panfrost_device_reset(struct panfrost_device *pfdev, bool enable_job_int)
 {
+       panfrost_perfcnt_reset(pfdev);
        panfrost_gpu_soft_reset(pfdev);
 
        panfrost_gpu_power_on(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c 
b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
index ad1156678e91..c2087ea705fe 100644
--- a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -33,6 +33,7 @@ struct panfrost_perfcnt {
        struct panfrost_file_priv *user;
        struct mutex lock;
        struct completion dump_comp;
+       atomic_t hw_reset_happened;
 };
 
 static void panfrost_perfcnt_gpu_disable(struct panfrost_device *pfdev)
@@ -57,9 +58,13 @@ void panfrost_perfcnt_sample_done(struct panfrost_device 
*pfdev)
 
 static int panfrost_perfcnt_dump_locked(struct panfrost_device *pfdev)
 {
+       struct panfrost_perfcnt *perfcnt = pfdev->perfcnt;
        u64 gpuva;
        int ret;
 
+       if (atomic_read(&perfcnt->hw_reset_happened))
+               return -EIO;
+
        reinit_completion(&pfdev->perfcnt->dump_comp);
        gpuva = pfdev->perfcnt->mapping->mmnode.start << PAGE_SHIFT;
        gpu_write(pfdev, GPU_PERFCNT_BASE_LO, lower_32_bits(gpuva));
@@ -140,6 +145,15 @@ static int panfrost_perfcnt_enable_locked(struct 
panfrost_device *pfdev,
                goto err_vunmap;
        }
 
+       /* If a reset is ongoing, the AS we get right below will be torn
+        * down, so rather than waiting until this becomes obvious in a
+        * perfcnt_dump() ioctl, we ask the user to try again slightly later.
+        */
+       if (atomic_read(&pfdev->reset.pending)) {
+               ret = -EAGAIN;
+               goto err_vunmap;
+       }
+
        ret = panfrost_mmu_as_get(pfdev, perfcnt->mapping->mmu);
        if (ret < 0)
                goto err_vunmap;
@@ -173,6 +187,16 @@ static int panfrost_perfcnt_enable_locked(struct 
panfrost_device *pfdev,
        if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
                gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
 
+       /* If a reset happened, we've no way of knowing whether it was between 
the time we called
+        * panfrost_mmu_as_get() or before perfcnt_enable(), so clearing this 
flag and going forward
+        * isn't possible. We must clear the flag and try again in the hopes no 
resets will happen
+        * between this and the next ioctl invocation.
+        */
+       if (atomic_cmpxchg(&perfcnt->hw_reset_happened, 1, 0)) {
+               ret = EAGAIN;
+               goto err_disable;
+       }
+
        /* The BO ref is retained by the mapping. */
        drm_gem_object_put(&bo->base);
 
@@ -180,6 +204,8 @@ static int panfrost_perfcnt_enable_locked(struct 
panfrost_device *pfdev,
 
        return 0;
 
+err_disable:
+       panfrost_perfcnt_gpu_disable(pfdev);
 err_vunmap:
        drm_gem_vunmap(&bo->base, &map);
 err_put_mapping:
@@ -209,7 +235,8 @@ static int panfrost_perfcnt_disable_locked(struct 
panfrost_device *pfdev,
        drm_gem_vunmap(&perfcnt->mapping->obj->base.base, &map);
        perfcnt->buf = NULL;
        panfrost_gem_close(&perfcnt->mapping->obj->base.base, file_priv);
-       panfrost_mmu_as_put(pfdev, perfcnt->mapping->mmu);
+       if (!atomic_read(&perfcnt->hw_reset_happened))
+               panfrost_mmu_as_put(pfdev, perfcnt->mapping->mmu);
        panfrost_gem_mapping_put(perfcnt->mapping);
        perfcnt->mapping = NULL;
        pm_runtime_put_autosuspend(pfdev->base.dev);
@@ -346,3 +373,20 @@ void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
        /* Disable everything before leaving. */
        panfrost_perfcnt_gpu_disable(pfdev);
 }
+
+void panfrost_perfcnt_reset(struct panfrost_device *pfdev)
+{
+       struct panfrost_perfcnt *perfcnt = pfdev->perfcnt;
+
+       /* Since this function will be called either from a scheduled HW reset
+        * or a runtime resume, tearing down any perfcnt resources means we're
+        * doomed to deadlocking with perfcnt_{enable/disable}, since we'd have
+        * to take the perfecnt lock. On top of that, it'd also violate DMA 
fence
+        * signalling rules because GFP_KERNEL allocations are made with the 
perfcnt
+        * lock taken in perfcnt_enable. In light of this, the only thing we 
can do
+        * is disabling perfcnt unconditionally, and notifying the perfcnt user 
of
+        * the reset having happpened so that they can take recovery measures.
+        */
+       panfrost_perfcnt_gpu_disable(pfdev);
+       atomic_set(&perfcnt->hw_reset_happened, 1);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h 
b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
index 8bbcf5f5fb33..8b9bc704b634 100644
--- a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
@@ -14,5 +14,6 @@ int panfrost_ioctl_perfcnt_enable(struct drm_device *dev, 
void *data,
                                  struct drm_file *file_priv);
 int panfrost_ioctl_perfcnt_dump(struct drm_device *dev, void *data,
                                struct drm_file *file_priv);
+void panfrost_perfcnt_reset(struct panfrost_device *pfdev);
 
 #endif

-- 
2.53.0

Reply via email to