Hi Rob,
On 06/03/26 13:36, Rob Herring (Arm) wrote:
The Arm Ethos-U NPUs have a PMU with performance counters. The PMU h/w
supports up to 4 (U65) or 8 (U85) counters which can be programmed for
different events. There is also a dedicated cycle counter.
The ABI and implementation are copied from the V3D driver. The main
I'm planning to send a series fixing some perfmon locking issues in the
V3D driver later this week. But, to give you a brief summary of the main
issues:
1. The active perfmon is not protected against races (run_job() vs.
IOCTLs).
2. ethosu_switch_perfmon() only starts/stops the perfmon in run_job().
This means that if nothing is further queued up, a perfmon will never
actually be stopped.
More comments inline:
difference in the ABI is there is no query API for the the event list.
The events differ between the U65 and U85, so the events lists are
maintained in userspace along with other differences between the U65 and
U85.
The cycle counter is always enabled when the PMU is enabled. When the
user requests N events, reading the counters will return the N events
plus the cycle counter.
Signed-off-by: Rob Herring (Arm) <[email protected]>
---
MR for mesa with initial support is here:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40269
drivers/accel/ethosu/Makefile | 2 +-
drivers/accel/ethosu/ethosu_device.h | 21 ++
drivers/accel/ethosu/ethosu_drv.c | 17 +-
drivers/accel/ethosu/ethosu_drv.h | 62 ++++++
drivers/accel/ethosu/ethosu_job.c | 35 +++-
drivers/accel/ethosu/ethosu_job.h | 2 +
drivers/accel/ethosu/ethosu_perfmon.c | 282 ++++++++++++++++++++++++++
include/uapi/drm/ethosu_accel.h | 59 +++++-
8 files changed, 469 insertions(+), 11 deletions(-)
create mode 100644 drivers/accel/ethosu/ethosu_perfmon.c
[...]
diff --git a/drivers/accel/ethosu/ethosu_drv.h
b/drivers/accel/ethosu/ethosu_drv.h
index 9e21dfe94184..57eee79e4b83 100644
--- a/drivers/accel/ethosu/ethosu_drv.h
+++ b/drivers/accel/ethosu/ethosu_drv.h
@@ -3,13 +3,75 @@
#ifndef __ETHOSU_DRV_H__
#define __ETHOSU_DRV_H__
+#include <linux/idr.h>
+#include <linux/mutex.h>
#include <drm/gpu_scheduler.h>
struct ethosu_device;
+struct drm_device;
+struct drm_file;
struct ethosu_file_priv {
struct ethosu_device *edev;
struct drm_sched_entity sched_entity;
+
+ struct {
+ struct idr idr;
You may want to consider moving to XArray as we did in V3D [1].
[1]
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0a5b0d095bcdb219348ed8ae1c97ee99fc4913b8
+ struct mutex lock;
+ } perfmon;
+};
+
+/* Performance monitor object. The perform lifetime is controlled by userspace
+ * using perfmon related ioctls. A perfmon can be attached to a submit_cl
+ * request, and when this is the case, HW perf counters will be activated just
+ * before the submit_cl is submitted to the GPU and disabled when the job is
+ * done. This way, only events related to a specific job will be counted.
+ */
+struct ethosu_perfmon {
+ /* Tracks the number of users of the perfmon, when this counter reaches
+ * zero the perfmon is destroyed.
+ */
+ refcount_t refcnt;
+
+ /* Protects perfmon stop, as it can be invoked from multiple places. */
+ struct mutex lock;
I'm not familiar with the Ethos hardware, but *if* it has only a single
performance monitor active in HW at any given moment (like v3d), this
lock doesn't enforce this restriction.
This lock only serializes start/stop of *one* perfmon object against
itself, but the invariant (at least on v3d) that needs protection is
device-wide: there is exactly one active perfmon at any moment in HW.
+
+ /* Number of counters activated in this perfmon instance
+ * (should be less than or equal to DRM_ETHOSU_MAX_PERF_COUNTERS).
+ */
+ u8 ncounters;
+
+ /* Events counted by the HW perf counters. */
+ u16 counters[DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS];
+
+ /* Storage for counter values. Counters are incremented by the
+ * HW perf counter values every time the perfmon is attached
+ * to a GPU job. This way, perfmon users don't have to
s/GPU/NPU job maybe?
+ * retrieve the results after each job if they want to track
+ * events covering several submissions. Note that counter
+ * values can't be reset, but you can fake a reset by
+ * destroying the perfmon and creating a new one.
+ */
+ u64 values[] __counted_by(ncounters);
};
+/* ethosu_perfmon.c */
+void ethosu_perfmon_init(struct ethosu_device *ethosu);
+void ethosu_perfmon_get(struct ethosu_perfmon *perfmon);
+void ethosu_perfmon_put(struct ethosu_perfmon *perfmon);
+void ethosu_perfmon_start(struct ethosu_device *ethosu, struct ethosu_perfmon
*perfmon);
+void ethosu_perfmon_stop(struct ethosu_device *ethosu, struct ethosu_perfmon
*perfmon,
+ bool capture);
+struct ethosu_perfmon *ethosu_perfmon_find(struct ethosu_file_priv
*ethosu_priv, int id);
+void ethosu_perfmon_open_file(struct ethosu_file_priv *ethosu_priv);
+void ethosu_perfmon_close_file(struct ethosu_file_priv *ethosu_priv);
+int ethosu_ioctl_perfmon_create(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int ethosu_ioctl_perfmon_destroy(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int ethosu_ioctl_perfmon_get_values(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+int ethosu_ioctl_perfmon_set_global(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+
#endif
diff --git a/drivers/accel/ethosu/ethosu_job.c
b/drivers/accel/ethosu/ethosu_job.c
index ec85f4156744..8bf5a3aa8420 100644
--- a/drivers/accel/ethosu/ethosu_job.c
+++ b/drivers/accel/ethosu/ethosu_job.c
@@ -147,6 +147,8 @@ static void ethosu_job_err_cleanup(struct ethosu_job *job)
{
unsigned int i;
+ ethosu_perfmon_put(job->perfmon);
+
for (i = 0; i < job->region_cnt; i++)
drm_gem_object_put(job->region_bo[i]);
@@ -181,6 +183,24 @@ static void ethosu_job_free(struct drm_sched_job *sched_job)
ethosu_job_put(job);
}
+static void
+ethosu_switch_perfmon(struct ethosu_device *ethosu, struct ethosu_job *job)
+{
+ struct ethosu_perfmon *perfmon = ethosu->global_perfmon;
You might consider using READ_ONCE for global_perfmon, as this can race
with the set_global IOCTL.
+
+ if (!perfmon)
+ perfmon = job->perfmon;
+
+ if (perfmon == ethosu->active_perfmon)
+ return;
+
+ if (perfmon != ethosu->active_perfmon)
+ ethosu_perfmon_stop(ethosu, ethosu->active_perfmon, true);
+
+ if (perfmon && ethosu->active_perfmon != perfmon)
+ ethosu_perfmon_start(ethosu, perfmon);
+}
+
static struct dma_fence *ethosu_job_run(struct drm_sched_job *sched_job)
{
struct ethosu_job *job = to_ethosu_job(sched_job);
[...]
+int ethosu_ioctl_perfmon_set_global(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct ethosu_file_priv *ethosu_priv = file_priv->driver_priv;
+ struct drm_ethosu_perfmon_set_global *req = data;
+ struct ethosu_device *ethosu = to_ethosu_device(dev);
+ struct ethosu_perfmon *perfmon;
+
+ if (req->flags & ~DRM_ETHOSU_PERFMON_CLEAR_GLOBAL)
+ return -EINVAL;
+
+ perfmon = ethosu_perfmon_find(ethosu_priv, req->id);
+ if (!perfmon)
+ return -EINVAL;
+
+ /* If the request is to clear the global performance monitor */
+ if (req->flags & DRM_ETHOSU_PERFMON_CLEAR_GLOBAL) {
+ if (!ethosu->global_perfmon)
In case of failure, I believe you need to put the perfmon to avoid
leaking a reference.
Best regards,
- Maíra
+ return -EINVAL;
+
+ xchg(ðosu->global_perfmon, NULL);
+
+ return 0;
+ }
+
+ if (cmpxchg(ðosu->global_perfmon, NULL, perfmon))
+ return -EBUSY;
+
+ return 0;
+}