Hi Rob,

On Fri, Mar 6, 2026 at 5:38 PM Rob Herring (Arm) <[email protected]> wrote:
>
> The Arm Ethos-U NPUs have a PMU with performance counters. The PMU h/w
> supports up to 4 (U65) or 8 (U85) counters which can be programmed for
> different events. There is also a dedicated cycle counter.
>
> The ABI and implementation are copied from the V3D driver. The main
> difference in the ABI is there is no query API for the the event list.
> The events differ between the U65 and U85, so the events lists are
> maintained in userspace along with other differences between the U65 and
> U85.
>
> The cycle counter is always enabled when the PMU is enabled. When the
> user requests N events, reading the counters will return the N events
> plus the cycle counter.
>
> Signed-off-by: Rob Herring (Arm) <[email protected]>
> ---

<snip>

>
>  struct ethosu_file_priv {
>         struct ethosu_device *edev;
>         struct drm_sched_entity sched_entity;
> +
> +       struct {
> +               struct idr idr;
> +               struct mutex lock;
> +       } perfmon;
> +};
> +
> +/* Performance monitor object. The perform lifetime is controlled by 
> userspace
> + * using perfmon related ioctls. A perfmon can be attached to a submit_cl

I guess submit_cl is more of a v3d thing?

> + * request, and when this is the case, HW perf counters will be activated 
> just
> + * before the submit_cl is submitted to the GPU and disabled when the job is
> + * done. This way, only events related to a specific job will be counted.
> + */
> +struct ethosu_perfmon {
> +       /* Tracks the number of users of the perfmon, when this counter 
> reaches
> +        * zero the perfmon is destroyed.
> +        */
> +       refcount_t refcnt;
> +
> +       /* Protects perfmon stop, as it can be invoked from multiple places. 
> */
> +       struct mutex lock;
> +
> +       /* Number of counters activated in this perfmon instance
> +        * (should be less than or equal to DRM_ETHOSU_MAX_PERF_COUNTERS).
> +        */
> +       u8 ncounters;
> +
> +       /* Events counted by the HW perf counters. */
> +       u16 counters[DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS];
> +
> +       /* Storage for counter values. Counters are incremented by the
> +        * HW perf counter values every time the perfmon is attached
> +        * to a GPU job.  This way, perfmon users don't have to
> +        * retrieve the results after each job if they want to track
> +        * events covering several submissions.  Note that counter
> +        * values can't be reset, but you can fake a reset by
> +        * destroying the perfmon and creating a new one.
> +        */
> +       u64 values[] __counted_by(ncounters);
>  };

<snip>

> +void ethosu_perfmon_start(struct ethosu_device *ethosu, struct 
> ethosu_perfmon *perfmon)
> +{
> +       unsigned int i;
> +       u8 ncounters;
> +       u32 mask;
> +
> +       if (WARN_ON_ONCE(!perfmon || ethosu->active_perfmon))
> +               return;
> +
> +       writel_relaxed(PMCR_CNT_EN, ethosu->pmu_regs + NPU_REG_PMCR);
> +       writel_relaxed(0x000011, ethosu->pmu_regs + NPU_REG_PMCCNTR_CFG);

Can we use macros for this value so we have it documented in the source code?

<snip>

> +void ethosu_perfmon_stop(struct ethosu_device *ethosu, struct ethosu_perfmon 
> *perfmon,
> +                     bool capture)
> +{
> +       unsigned int i;
> +       u8 ncounters;
> +       u32 mask;
> +
> +       if (!perfmon || !ethosu->active_perfmon)
> +               return;
> +
> +       mutex_lock(&perfmon->lock);
> +       if (perfmon != ethosu->active_perfmon) {
> +               mutex_unlock(&perfmon->lock);
> +               return;
> +       }
> +
> +       ncounters = perfmon->ncounters - 1;
> +
> +       if (capture) {
> +               for (i = 0; i < ncounters; i++)
> +                       perfmon->values[i] += readl_relaxed(ethosu->pmu_regs 
> + NPU_REG_PMU_EVCNTR(i));
> +               perfmon->values[ncounters] +=
> +                       readl_relaxed(ethosu->pmu_regs + NPU_REG_PMCCNTR_LO) |
> +                       (u64)readl_relaxed(ethosu->pmu_regs + 
> NPU_REG_PMCCNTR_HI) << 32;
> +       }
> +
> +       mask = 0x80000000;
> +       if (ncounters)
> +               mask |= GENMASK(ncounters - 1, 0);
> +       writel_relaxed(mask, ethosu->pmu_regs + NPU_REG_PMCNTENCLR);
> +
> +       writel_relaxed(0, ethosu->pmu_regs + NPU_REG_PMCR);
> +       ethosu->active_perfmon = NULL;
> +       mutex_unlock(&perfmon->lock);

Wonder if the npu-idle counter won't continue to accumulate until we
call stop(), and in that case, if we shouldn't stop the counters on
the job IRQ. So we can know for how long the NPU was idle during the
job, without extra idle time.

I think we could freeze the counters in the job_done IRQ, and harvest
the values on job cleanup. At freeze time it would be good to also
capture a timestamp and return to userspace in get_values, so that
userspace doesn't have to take it with considerable delay (for tools
such as Perfetto).

<snip>

> +int ethosu_ioctl_perfmon_create(struct drm_device *dev, void *data,
> +                            struct drm_file *file_priv)
> +{
> +       struct ethosu_file_priv *ethosu_priv = file_priv->driver_priv;
> +       struct drm_ethosu_perfmon_create *req = data;
> +       struct ethosu_device *ethosu = to_ethosu_device(dev);
> +       struct ethosu_perfmon *perfmon;
> +       unsigned int i, event_max;
> +       int ret;
> +
> +       /* Number of monitored counters cannot exceed HW limits. */
> +       if (req->ncounters > ethosu->npu_info.pmu_counters)
> +               return -EINVAL;
> +
> +       /* Make sure all counters are valid. */
> +       event_max = ethosu_is_u65(ethosu) ? 433 : 671;

Can we have macros for these constants?

<snip>

> +int ethosu_ioctl_perfmon_get_values(struct drm_device *dev, void *data,
> +                                struct drm_file *file_priv)
> +{
> +       struct ethosu_device *ethosu = to_ethosu_device(dev);
> +       struct ethosu_file_priv *ethosu_priv = file_priv->driver_priv;
> +       struct drm_ethosu_perfmon_get_values *req = data;
> +       struct ethosu_perfmon *perfmon;
> +       int ret = 0;
> +
> +       if (req->pad != 0)
> +               return -EINVAL;
> +
> +       perfmon = ethosu_perfmon_find(ethosu_priv, req->id);
> +       if (!perfmon)
> +               return -EINVAL;
> +
> +       ret = pm_runtime_resume_and_get(dev->dev);
> +       if (ret)

Should we cal put() here?

> +               return ret;
> +
> +       ethosu_perfmon_stop(ethosu, perfmon, true);
> +
> +       pm_runtime_put_autosuspend(dev->dev);
> +
> +       if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->values,
> +                        perfmon->ncounters * sizeof(u64)))

And here?

> +               ret = -EFAULT;
> +
> +       ethosu_perfmon_put(perfmon);
> +
> +       return ret;
> +}
> +
> +int ethosu_ioctl_perfmon_set_global(struct drm_device *dev, void *data,
> +                                struct drm_file *file_priv)
> +{
> +       struct ethosu_file_priv *ethosu_priv = file_priv->driver_priv;
> +       struct drm_ethosu_perfmon_set_global *req = data;
> +       struct ethosu_device *ethosu = to_ethosu_device(dev);
> +       struct ethosu_perfmon *perfmon;
> +
> +       if (req->flags & ~DRM_ETHOSU_PERFMON_CLEAR_GLOBAL)
> +               return -EINVAL;
> +
> +       perfmon = ethosu_perfmon_find(ethosu_priv, req->id);

Aren't we leaking this second reference?

Thanks!

Tomeu

> +       if (!perfmon)
> +               return -EINVAL;
> +
> +       /* If the request is to clear the global performance monitor */
> +       if (req->flags & DRM_ETHOSU_PERFMON_CLEAR_GLOBAL) {
> +               if (!ethosu->global_perfmon)
> +                       return -EINVAL;
> +
> +               xchg(&ethosu->global_perfmon, NULL);
> +
> +               return 0;
> +       }
> +
> +       if (cmpxchg(&ethosu->global_perfmon, NULL, perfmon))
> +               return -EBUSY;
> +
> +       return 0;
> +}
> diff --git a/include/uapi/drm/ethosu_accel.h b/include/uapi/drm/ethosu_accel.h
> index af78bb4686d7..dde6756642ea 100644
> --- a/include/uapi/drm/ethosu_accel.h
> +++ b/include/uapi/drm/ethosu_accel.h
> @@ -43,6 +43,11 @@ enum drm_ethosu_ioctl_id {
>
>         /** @DRM_ETHOSU_SUBMIT: Submit a job and BOs to run. */
>         DRM_ETHOSU_SUBMIT,
> +
> +       DRM_ETHOSU_PERFMON_CREATE,
> +       DRM_ETHOSU_PERFMON_DESTROY,
> +       DRM_ETHOSU_PERFMON_GET_VALUES,
> +       DRM_ETHOSU_PERFMON_SET_GLOBAL,
>  };
>
>  /**
> @@ -79,6 +84,7 @@ struct drm_ethosu_npu_info {
>         __u32 config;
>
>         __u32 sram_size;
> +       __u32 pmu_counters;
>  };
>
>  /**
> @@ -220,8 +226,51 @@ struct drm_ethosu_submit {
>         /** Input: Number of jobs passed in. */
>         __u32 job_count;
>
> -       /** Reserved, must be zero. */
> +       /** Input: Id returned by DRM_ETHOSU_PERFMON_CREATE */
> +       __u32 perfmon_id;
> +};
> +
> +#define DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS     8
> +#define DRM_ETHOSU_MAX_PERF_COUNTERS \
> +       (DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS + 1)
> +
> +struct drm_ethosu_perfmon_create {
> +       __u32 id;
> +       __u32 ncounters;
> +       __u16 counters[DRM_ETHOSU_MAX_PERF_EVENT_COUNTERS];
> +};
> +
> +struct drm_ethosu_perfmon_destroy {
> +       __u32 id;
> +};
> +
> +/*
> + * Returns the values of the performance counters tracked by this
> + * perfmon (as an array of (ncounters + 1) u64 values).
> + *
> + * No implicit synchronization is performed, so the user has to
> + * guarantee that any jobs using this perfmon have already been
> + * completed.
> + */
> +struct drm_ethosu_perfmon_get_values {
> +       __u32 id;
>         __u32 pad;
> +       __u64 values_ptr;
> +};
> +
> +#define DRM_ETHOSU_PERFMON_CLEAR_GLOBAL    0x0001
> +
> +/**
> + * struct drm_ethosu_perfmon_set_global - ioctl to define a global 
> performance
> + * monitor
> + *
> + * The global performance monitor will be used for all jobs. If a global
> + * performance monitor is defined, jobs with a self-defined performance
> + * monitor won't be allowed.
> + */
> +struct drm_ethosu_perfmon_set_global {
> +       __u32 flags;
> +       __u32 id;
>  };
>
>  /**
> @@ -252,6 +301,14 @@ enum {
>                 DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, 
> cmdstream_bo_create),
>         DRM_IOCTL_ETHOSU_SUBMIT =
>                 DRM_IOCTL_ETHOSU(WR, SUBMIT, submit),
> +       DRM_IOCTL_ETHOSU_PERFMON_CREATE =
> +               DRM_IOCTL_ETHOSU(WR, PERFMON_CREATE, perfmon_create),
> +       DRM_IOCTL_ETHOSU_PERFMON_DESTROY =
> +               DRM_IOCTL_ETHOSU(WR, PERFMON_DESTROY, perfmon_destroy),
> +       DRM_IOCTL_ETHOSU_PERFMON_GET_VALUES =
> +               DRM_IOCTL_ETHOSU(WR, PERFMON_GET_VALUES, perfmon_get_values),
> +       DRM_IOCTL_ETHOSU_PERFMON_SET_GLOBAL =
> +               DRM_IOCTL_ETHOSU(WR, PERFMON_SET_GLOBAL, perfmon_set_global),
>  };
>
>  #if defined(__cplusplus)
> --
> 2.51.0
>

Reply via email to