Hi Rob,

On 15/05/26 00:26, Rob Herring (Arm) wrote:
The Arm Ethos-U NPUs have a PMU with performance counters. The PMU h/w
supports up to 4 (U65) or 8 (U85) counters which can be programmed for
different events. There is also a dedicated cycle counter.

The ABI and implementation are copied from the V3D driver. The main
difference in the ABI is there is no query API for the the event list.
The events differ between the U65 and U85, so the events lists are
maintained in userspace along with other differences between the U65 and
U85.

The cycle counter is always enabled when the PMU is enabled. When the
user requests N events, reading the counters will return the N events
plus the cycle counter.

Signed-off-by: Rob Herring (Arm) <[email protected]>
---
v2:
  - Use XArray instead of idr
  - Rework locking to use per device spinlock to protect modifying active
    perfmon. Based on pending V3D changes:
    
https://lore.kernel.org/all/[email protected]/
  - Add missing perfmon puts in ethosu_ioctl_perfmon_set_global() and
    ethosu_ioctl_perfmon_get_values() error paths.
  - Fix reading number of counters on U85.
  - Add defines NPU_REG_PMCCNTR_CFG
---
  drivers/accel/ethosu/Makefile         |   2 +-
  drivers/accel/ethosu/ethosu_device.h  |  32 +++
  drivers/accel/ethosu/ethosu_drv.c     |  21 +-
  drivers/accel/ethosu/ethosu_drv.h     |  62 +++++-
  drivers/accel/ethosu/ethosu_job.c     |  41 +++-
  drivers/accel/ethosu/ethosu_job.h     |   2 +
  drivers/accel/ethosu/ethosu_perfmon.c | 296 ++++++++++++++++++++++++++
  include/uapi/drm/ethosu_accel.h       |  59 ++++-
  8 files changed, 500 insertions(+), 15 deletions(-)
  create mode 100644 drivers/accel/ethosu/ethosu_perfmon.c


[...]

diff --git a/drivers/accel/ethosu/ethosu_drv.h 
b/drivers/accel/ethosu/ethosu_drv.h
index 9e21dfe94184..8fed43c2a7af 100644
--- a/drivers/accel/ethosu/ethosu_drv.h
+++ b/drivers/accel/ethosu/ethosu_drv.h
@@ -1,15 +1,75 @@
  /* SPDX-License-Identifier: GPL-2.0-only OR MIT */
-/* Copyright 2025 Arm, Ltd. */
+/* Copyright 2025-2026 Arm, Ltd. */
  #ifndef __ETHOSU_DRV_H__
  #define __ETHOSU_DRV_H__
+#include <linux/mutex.h>
+#include <linux/xarray.h>
  #include <drm/gpu_scheduler.h>
struct ethosu_device;
+struct drm_device;
+struct drm_file;
struct ethosu_file_priv {
        struct ethosu_device *edev;
        struct drm_sched_entity sched_entity;
+       struct xarray perfmons;
  };
+/* Performance monitor object. The perform lifetime is controlled by userspace
+ * using perfmon related ioctls. A perfmon can be attached to a submit_cl

submit_cl?

+ * request, and when this is the case, HW perf counters will be activated just
+ * before the submit_cl is submitted to the GPU and disabled when the job is

Maybe replace the "submit_cl is submitted to the GPU" with NPU related
wording?

+ * done. This way, only events related to a specific job will be counted.
+ */
+struct ethosu_perfmon {
+       /* Tracks the number of users of the perfmon, when this counter reaches
+        * zero the perfmon is destroyed.
+        */
+       refcount_t refcnt;
+
+       /* Number of counters activated in this perfmon instance
+        * (should be less than or equal to DRM_ETHOSU_MAX_PERF_COUNTERS).
+        */
+       u8 ncounters;
+
+       /* Events counted by the HW perf counters. */
+       u16 counters[DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS];
+
+       /*
+        * Storage for counter values. Counters are incremented by the HW
+        * perf counter values every time the perfmon is attached to a
+        * NPU job. This way, perfmon users don't have to retrieve the
+        * results after each job if they want to track events covering
+        * several submissions. Note that counter values can't be reset,
+        * but you can fake a reset by destroying the perfmon and
+        * creating a new one.
+        */
+       u64 values[] __counted_by(ncounters);
+};
+
+/* ethosu_perfmon.c */
+void ethosu_perfmon_init(struct ethosu_device *ethosu);
+void ethosu_perfmon_get(struct ethosu_perfmon *perfmon);
+void ethosu_perfmon_put(struct ethosu_perfmon *perfmon);
+void ethosu_perfmon_start(struct ethosu_device *ethosu,
+                         struct ethosu_perfmon *perfmon);
+void ethosu_perfmon_stop(struct ethosu_device *ethosu,
+                        struct ethosu_perfmon *perfmon, bool capture);
+void ethosu_perfmon_stop_locked(struct ethosu_device *ethosu, struct 
ethosu_perfmon *perfmon,
+                               bool capture);
+struct ethosu_perfmon *ethosu_perfmon_find(struct ethosu_file_priv 
*ethosu_priv,
+                                          int id);
+void ethosu_perfmon_open_file(struct ethosu_file_priv *ethosu_priv);
+void ethosu_perfmon_close_file(struct ethosu_file_priv *ethosu_priv);
+int ethosu_ioctl_perfmon_create(struct drm_device *dev, void *data,
+                               struct drm_file *file_priv);
+int ethosu_ioctl_perfmon_destroy(struct drm_device *dev, void *data,
+                                struct drm_file *file_priv);
+int ethosu_ioctl_perfmon_get_values(struct drm_device *dev, void *data,
+                                   struct drm_file *file_priv);
+int ethosu_ioctl_perfmon_set_global(struct drm_device *dev, void *data,
+                                   struct drm_file *file_priv);
+
  #endif
diff --git a/drivers/accel/ethosu/ethosu_job.c 
b/drivers/accel/ethosu/ethosu_job.c
index e7b07cdbcced..5712848236af 100644
--- a/drivers/accel/ethosu/ethosu_job.c
+++ b/drivers/accel/ethosu/ethosu_job.c
@@ -1,6 +1,6 @@
  // SPDX-License-Identifier: GPL-2.0-only OR MIT
  /* Copyright 2024-2025 Tomeu Vizoso <[email protected]> */
-/* Copyright 2025 Arm, Ltd. */
+/* Copyright 2025-2026 Arm, Ltd. */
#include <linux/bitfield.h>
  #include <linux/genalloc.h>
@@ -147,6 +147,8 @@ static void ethosu_job_err_cleanup(struct ethosu_job *job)
  {
        unsigned int i;
+ ethosu_perfmon_put(job->perfmon);
+
        for (i = 0; i < job->region_cnt; i++)
                drm_gem_object_put(job->region_bo[i]);
@@ -181,6 +183,26 @@ static void ethosu_job_free(struct drm_sched_job *sched_job)
        ethosu_job_put(job);
  }
+static void
+ethosu_switch_perfmon(struct ethosu_device *ethosu, struct ethosu_job *job)
+{
+       struct ethosu_perfmon *perfmon;
+
+       guard(spinlock)(&ethosu->perfmon_state.lock);
+
+       perfmon = READ_ONCE(ethosu->global_perfmon);

Considering that you are under the spinlock, I believe that using
READ_ONCE is unneeded.

+       if (!perfmon)
+               perfmon = job->perfmon;
+
+       if (perfmon == ethosu->perfmon_state.active)
+               return;
+
+       ethosu_perfmon_stop_locked(ethosu, ethosu->perfmon_state.active, true);
+
+       if (perfmon && ethosu->perfmon_state.active != perfmon)
+               ethosu_perfmon_start(ethosu, perfmon);

Also, if you are going to keep ethosu_switch_perfmon(), you don't need
to use a spinlock. In V3D, I'm only using a spinlock because I'm
stopping the perfmon in a IRQ context. But considering that you are
stopping the perfmon in run_job(), you can use a sleeping mutex.

Moreover, I believe that if (perfmon != NULL) then automatically
(ethosu->perfmon_state.active != perfmon) as we already checked if
(perfmon == ethosu->perfmon_state.active).

Best regards,
- Maíra

Reply via email to