Resending my partial review of patch 6/8 because of typos in the previous one.
Sorry for any hassle this may have caused.

On 11.12.2024 16:50, Lukas Zapolskas wrote:
> From: Adrián Larumbe <adrian.laru...@collabora.com>
> 
> The sampler aggregates counter and set requests coming from userspace
> and mediates interactions with the FW interface, to ensure that user
> sessions cannot override the global configuration.
> 
> From the top-level interface, the sampler supports two different types
> of samples: clearing samples and regular samples. Clearing samples are
> a special sample type that allow for the creation of a sampling
> baseline, to ensure that a session does not obtain counter data from
> before its creation.
> 
> Upon receipt of a relevant interrupt, corresponding to one of the three
> relevant bits of the GLB_ACK register, the sampler takes any samples
> that occurred, and, based on the insert and extract indices, accumulates
> them to an internal storage buffer after zero-extending the counters
> from the 32-bit counters emitted by the hardware to 64-bit counters
> for internal accumulation.
> 
> When the performance counters are enabled, the FW ensures no counter
> data is lost when entering and leaving non-counting regions by producing
> automatic samples that do not correspond to a GLB_REQ.PRFCNT_SAMPLE
> request. Such regions may be per hardware unit, such as when a shader
> core powers down, or global. Most of these events do not directly
> correspond to session sample requests, so any intermediary counter data
> must be stored into a temporary accumulation buffer.
> 
> If there are sessions waiting for a sample, this accumulated buffer will
> be taken, and emitted for each waiting client. During this phase,
> information like the timestamps of sample request and sample emission,
> type of the counter block and block index annotations are added to the
> sample header and block headers. If no sessions are waiting for
> a sample, this accumulation buffer is kept until the next time a sample
> is requested.
> 
> Special handling is needed for the PRFCNT_OVERFLOW interrupt, which is
> an indication that the internal sample handling rate was insufficient.
> 
> The sampler also maintains a buffer descriptor indicating the structure
> of a firmware sample, since neither the firmware nor the hardware give
> any indication of the sample structure, only that it is composed out of
> three parts:
>  - the metadata is an optional initial counter block on supporting
>    firmware versions that contains a single counter, indicating the
>    reason a sample was taken when entering global non-counting regions.
>    This is used to provide coarse-grained information about why a sample
>    was taken to userspace, to help userspace interpret variations in
>    counter magnitude.
>  - the firmware component of the sample is composed out of a global
>    firmware counter block on supporting firmware versions.
>  - the hardware component is the most sizeable of the three and contains
>    a block of counters for each of the underlying hardware resources. It
>    has a fixed structure that is described in the architecture
>    specification, and contains the command stream hardware block(s), the
>    tiler block(s), the MMU and L2 blocks (collectively named the memsys
>    blocks) and the shader core blocks, in that order.
> The structure of this buffer changes based on the firmware and hardware
> combination, but is constant on a single system.
> 
> Signed-off-by: Adrián Larumbe <adrian.laru...@collabora.com>
> Co-developed-by: Lukas Zapolskas <lukas.zapols...@arm.com>
> Signed-off-by: Lukas Zapolskas <lukas.zapols...@arm.com>
> ---
>  drivers/gpu/drm/panthor/panthor_fw.c   |   5 +
>  drivers/gpu/drm/panthor/panthor_fw.h   |   9 +-
>  drivers/gpu/drm/panthor/panthor_perf.c | 882 ++++++++++++++++++++++++-
>  drivers/gpu/drm/panthor/panthor_perf.h |   2 +
>  include/uapi/drm/panthor_drm.h         |   5 +-
>  5 files changed, 892 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_fw.c 
> b/drivers/gpu/drm/panthor/panthor_fw.c
> index e9530d1d9781..cd68870ced18 100644
> --- a/drivers/gpu/drm/panthor/panthor_fw.c
> +++ b/drivers/gpu/drm/panthor/panthor_fw.c
> @@ -1000,9 +1000,12 @@ static void panthor_fw_init_global_iface(struct 
> panthor_device *ptdev)
>  
>       /* Enable interrupts we care about. */
>       glb_iface->input->ack_irq_mask = GLB_CFG_ALLOC_EN |
> +                                      GLB_PERFCNT_SAMPLE |
>                                        GLB_PING |
>                                        GLB_CFG_PROGRESS_TIMER |
>                                        GLB_CFG_POWEROFF_TIMER |
> +                                      GLB_PERFCNT_THRESHOLD |
> +                                      GLB_PERFCNT_OVERFLOW |
>                                        GLB_IDLE_EN |
>                                        GLB_IDLE;
>  
> @@ -1031,6 +1034,8 @@ static void panthor_job_irq_handler(struct 
> panthor_device *ptdev, u32 status)
>               return;
>  
>       panthor_sched_report_fw_events(ptdev, status);
> +
> +     panthor_perf_report_irq(ptdev, status);
>  }
>  PANTHOR_IRQ_HANDLER(job, JOB, panthor_job_irq_handler);
>  
> diff --git a/drivers/gpu/drm/panthor/panthor_fw.h 
> b/drivers/gpu/drm/panthor/panthor_fw.h
> index db10358e24bb..7ed34d2de8b4 100644
> --- a/drivers/gpu/drm/panthor/panthor_fw.h
> +++ b/drivers/gpu/drm/panthor/panthor_fw.h
> @@ -199,9 +199,10 @@ struct panthor_fw_global_control_iface {
>       u32 group_num;
>       u32 group_stride;
>  #define GLB_PERFCNT_FW_SIZE(x) ((((x) >> 16) << 8))
> +#define GLB_PERFCNT_HW_SIZE(x) (((x) & GENMASK(15, 0)) << 8)
>       u32 perfcnt_size;
>       u32 instr_features;
> -#define PERFCNT_FEATURES_MD_SIZE(x) ((x) & GENMASK(3, 0))
> +#define PERFCNT_FEATURES_MD_SIZE(x) (((x) & GENMASK(3, 0)) << 8)
>       u32 perfcnt_features;
>  };
>  
> @@ -211,7 +212,7 @@ struct panthor_fw_global_input_iface {
>  #define GLB_CFG_ALLOC_EN                     BIT(2)
>  #define GLB_CFG_POWEROFF_TIMER                       BIT(3)
>  #define GLB_PROTM_ENTER                              BIT(4)
> -#define GLB_PERFCNT_EN                               BIT(5)
> +#define GLB_PERFCNT_ENABLE                   BIT(5)
>  #define GLB_PERFCNT_SAMPLE                   BIT(6)
>  #define GLB_COUNTER_EN                               BIT(7)
>  #define GLB_PING                             BIT(8)
> @@ -234,7 +235,6 @@ struct panthor_fw_global_input_iface {
>       u32 doorbell_req;
>       u32 reserved1;
>       u32 progress_timer;
> -
>  #define GLB_TIMER_VAL(x)                     ((x) & GENMASK(30, 0))
>  #define GLB_TIMER_SOURCE_GPU_COUNTER         BIT(31)
>       u32 poweroff_timer;
> @@ -244,6 +244,9 @@ struct panthor_fw_global_input_iface {
>       u64 perfcnt_base;
>       u32 perfcnt_extract;
>       u32 reserved3[3];
> +#define GLB_PRFCNT_CONFIG_SIZE(x) ((x) & GENMASK(7, 0))
> +#define GLB_PRFCNT_CONFIG_SET(x) (((x) & GENMASK(1, 0)) << 8)
> +#define GLB_PRFCNT_METADATA_ENABLE BIT(10)
>       u32 perfcnt_config;
>       u32 perfcnt_csg_select;
>       u32 perfcnt_fw_enable;
> diff --git a/drivers/gpu/drm/panthor/panthor_perf.c 
> b/drivers/gpu/drm/panthor/panthor_perf.c
> index 42d8b6f8c45d..d62d97c448da 100644
> --- a/drivers/gpu/drm/panthor/panthor_perf.c
> +++ b/drivers/gpu/drm/panthor/panthor_perf.c
> @@ -15,7 +15,9 @@
>  
>  #include "panthor_device.h"
>  #include "panthor_fw.h"
> +#include "panthor_gem.h"
>  #include "panthor_gpu.h"
> +#include "panthor_mmu.h"
>  #include "panthor_perf.h"
>  #include "panthor_regs.h"
>  
> @@ -26,6 +28,41 @@
>   */
>  #define PANTHOR_PERF_EM_BITS (BITS_PER_TYPE(u64) * 2)
>  
> +/**
> + * PANTHOR_PERF_FW_RINGBUF_SLOTS - Number of slots allocated for individual 
> samples when configuring
> + *                                 the performance counter ring buffer to 
> firmware. This can be
> + *                                 used to reduce memory consumption on low 
> memory systems.
> + */
> +#define PANTHOR_PERF_FW_RINGBUF_SLOTS (32)
> +
> +/**
> + * PANTHOR_CTR_TIMESTAMP_LO - The first architecturally mandated counter of 
> every block type
> + *                            contains the low 32-bits of the TIMESTAMP 
> value.
> + */
> +#define PANTHOR_CTR_TIMESTAMP_LO (0)
> +
> +/**
> + * PANTHOR_CTR_TIMESTAMP_HI - The register offset containinig the high 
> 32-bits of the TIMESTAMP
> + *                            value.
> + */
> +#define PANTHOR_CTR_TIMESTAMP_HI (1)
> +
> +/**
> + * PANTHOR_CTR_PRFCNT_EN - The register offset containing the enable mask 
> for the enabled counters
> + *                         that were written to memory.
> + */
> +#define PANTHOR_CTR_PRFCNT_EN (2)
> +
> +/**
> + * PANTHOR_HEADER_COUNTERS - The first four counters of every block type are 
> architecturally
> + *                           defined to be equivalent. The fourth counter is 
> always reserved,
> + *                           and should be zero and as such, does not have a 
> separate define.
> + *
> + *                           These are the only four counters that are the 
> same between different
> + *                           blocks and are consistent between different 
> architectures.
> + */
> +#define PANTHOR_HEADER_COUNTERS (4)
> +
>  /**
>   * enum panthor_perf_session_state - Session state bits.
>   */
> @@ -158,6 +195,135 @@ struct panthor_perf_session {
>       struct kref ref;
>  };
>  
> +struct panthor_perf_buffer_descriptor {
> +     /**
> +      * @block_size: The size of a single block in the FW ring buffer, equal 
> to
> +      *              sizeof(u32) * counters_per_block.
> +      */
> +     size_t block_size;
> +
> +     /**
> +      * @buffer_size: The total size of the buffer, equal to (#hardware 
> blocks +
> +      *               #firmware blocks) * block_size.
> +      */
> +     size_t buffer_size;
> +
> +     /**
> +      * @available_blocks: Bitmask indicating the blocks supported by the 
> hardware and firmware
> +      *                    combination. Note that this can also include 
> blocks that will not
> +      *                    be exposed to the user.
> +      */
> +     DECLARE_BITMAP(available_blocks, DRM_PANTHOR_PERF_BLOCK_MAX);
> +     struct {
> +             /** @offset: Starting offset of a block of type @type in the FW 
> ringbuffer. */
> +             size_t offset;
> +
> +             /** @type: Type of the blocks between @blocks[i].offset and 
> @blocks[i+1].offset. */
> +             enum drm_panthor_perf_block_type type;

I think you could avoid declaring the type member altogether, because a block's
type is always the same as its index in the blocks array. See [1] below.

> +             /** @block_count: Number of blocks of the given @type, starting 
> at @offset. */
> +             size_t block_count;
> +     } blocks[DRM_PANTHOR_PERF_BLOCK_MAX];
> +};
> +
> +
> +/**
> + * struct panthor_perf_sampler - Interface to de-multiplex firmware 
> interaction and handle
> + *                               global interactions.
> + */
> +struct panthor_perf_sampler {
> +     /** @sample_requested: A sample has been requested. */
> +     bool sample_requested;
> +
> +     /**
> +      * @last_ack: Temporarily storing the last GLB_ACK status. Without 
> storing this data,
> +      *            we do not know whether a toggle bit has been handled.
> +      */
> +     u32 last_ack;
> +
> +     /**
> +      * @enabled_clients: The number of clients concurrently requesting 
> samples. To ensure that
> +      *                   one client cannot deny samples to another, we must 
> ensure that clients
> +      *                   are effectively reference counted.
> +      */
> +     atomic_t enabled_clients;
> +
> +     /**
> +      * @sample_handled: Synchronization point between the interrupt bottom 
> half and the
> +      *                  main sampler interface. Must be re-armed solely on 
> a new request
> +      *                  coming to the sampler.
> +      */
> +     struct completion sample_handled;
> +
> +     /** @rb: Kernel BO in the FW AS containing the sample ringbuffer. */
> +     struct panthor_kernel_bo *rb;
> +
> +     /**
> +      * @sample_size: The size of a single sample in the FW ringbuffer. This 
> is computed using
> +      *               the hardware configuration according to the 
> architecture specification,
> +      *               and cross-validated against the sample size reported 
> by FW to ensure
> +      *               a consistent view of the buffer size.
> +      */
> +     size_t sample_size;
> +
> +     /**
> +      * @sample_slots: Number of slots for samples in the FW ringbuffer. 
> Could be static,
> +      *                but may be useful to customize for low-memory devices.
> +      */
> +     size_t sample_slots;
> +
> +     /**
> +      * @config_lock: Lock serializing changes to the global counter 
> configuration, including
> +      *               requested counter set and the counters themselves.
> +      */
> +     struct mutex config_lock;
> +
> +     /**
> +      * @ems: List of enable maps of the active sessions. When removing a 
> session, the number
> +      *       of requested counters may decrease, and the union of enable 
> masks from the multiple
> +      *       sessions does not provide sufficient information to 
> reconstruct the previous
> +      *       enable mask.
> +      */
> +     struct list_head ems;
> +
> +     /** @em: Combined enable mask for all of the active sessions. */
> +     struct panthor_perf_enable_masks *em;
> +
> +     /**
> +      * @desc: Buffer descriptor for a sample in the FW ringbuffer. Note 
> that this buffer
> +      *        at current time does some interesting things with the zeroth 
> block type. On
> +      *        newer FW revisions, the first counter block of the sample is 
> the METADATA block,
> +      *        which contains a single value indicating the reason the 
> sample was taken (if
> +      *        any). This block must not be exposed to userspace, as 
> userspace does not
> +      *        have sufficient context to interpret it. As such, this block 
> type is not
> +      *        added to the uAPI, but we still use it in the kernel.
> +      */
> +     struct panthor_perf_buffer_descriptor desc;
> +
> +     /**
> +      * @sample: Pointer to an upscaled and annotated sample that may be 
> emitted to userspace.
> +      *          This is used both as an intermediate buffer to do the 
> zero-extension of the
> +      *          32-bit counters to 64-bits and as a storage buffer in case 
> the sampler
> +      *          requests an additional sample that was not requested by any 
> of the top-level
> +      *          sessions (for instance, when changing the enable masks).
> +      */
> +     u8 *sample;
> +
> +     /** @sampler_lock: Lock used to guard the list of sessions requesting 
> samples. */
> +     struct mutex sampler_lock;
> +
> +     /** @sampler_list: List of sessions requesting samples. */
> +     struct list_head sampler_list;
> +
> +     /** @set_config: The set that will be configured onto the hardware. */
> +     u8 set_config;
> +
> +     /**
> +      * @ptdev: Backpointer to the Panthor device, needed to ring the global 
> doorbell and
> +      *         interface with FW.
> +      */
> +     struct panthor_device *ptdev;
> +};
>  
>  struct panthor_perf {
>       /**
> @@ -175,6 +341,9 @@ struct panthor_perf {
>        * @sessions: Global map of sessions, accessed by their ID.
>        */
>       struct xarray sessions;
> +
> +     /** @sampler: FW control interface. */
> +     struct panthor_perf_sampler sampler;
>  };
>  
>  /**
> @@ -247,6 +416,23 @@ static struct panthor_perf_enable_masks 
> *panthor_perf_create_em(struct drm_panth
>       return em;
>  }
>  
> +static void panthor_perf_em_add(struct panthor_perf_enable_masks *dst_em,
> +             const struct panthor_perf_enable_masks *const src_em)
> +{
> +     size_t i = 0;
> +
> +     for (i = DRM_PANTHOR_PERF_BLOCK_FW; i <= DRM_PANTHOR_PERF_BLOCK_LAST; 
> i++)
> +             bitmap_or(dst_em->mask[i], dst_em->mask[i], src_em->mask[i], 
> PANTHOR_PERF_EM_BITS);
> +}
> +
> +static void panthor_perf_em_zero(struct panthor_perf_enable_masks *em)
> +{
> +     size_t i = 0;
> +
> +     for (i = DRM_PANTHOR_PERF_BLOCK_FW; i <= DRM_PANTHOR_PERF_BLOCK_LAST; 
> i++)
> +             bitmap_zero(em->mask[i], PANTHOR_PERF_EM_BITS);
> +}
> +
>  static void panthor_perf_destroy_em_kref(struct kref *em_kref)
>  {
>       struct panthor_perf_enable_masks *em = container_of(em_kref, 
> typeof(*em), refs);
> @@ -270,6 +456,12 @@ static u32 session_read_extract_idx(struct 
> panthor_perf_session *session)
>       return smp_load_acquire(session->extract_idx);
>  }
>  
> +static void session_write_insert_idx(struct panthor_perf_session *session, 
> u32 idx)
> +{
> +     /* Userspace needs the insert index to know where to look for the 
> sample. */
> +     smp_store_release(session->insert_idx, idx);
> +}
> +
>  static u32 session_read_insert_idx(struct panthor_perf_session *session)
>  {
>       return *session->insert_idx;
> @@ -349,6 +541,70 @@ static struct panthor_perf_session *session_find(struct 
> panthor_file *pfile,
>       return session;
>  }
>  
> +static u32 compress_enable_mask(unsigned long *const src)
> +{
> +     size_t i;
> +     u32 result = 0;
> +     unsigned long clump;
> +
> +     for_each_set_clump8(i, clump, src, PANTHOR_PERF_EM_BITS) {
> +             const unsigned long shift = div_u64(i, 4);
> +
> +             result |= !!(clump & GENMASK(3, 0)) << shift;
> +             result |= !!(clump & GENMASK(7, 4)) << (shift + 1);
> +     }
> +
> +     return result;
> +}
> +
> +static void expand_enable_mask(u32 em, unsigned long *const dst)
> +{
> +     size_t i;
> +     DECLARE_BITMAP(emb, BITS_PER_TYPE(u32));
> +
> +     bitmap_from_arr32(emb, &em, BITS_PER_TYPE(u32));
> +
> +     for_each_set_bit(i, emb, BITS_PER_TYPE(u32))
> +             bitmap_set(dst, i * 4, 4);
> +}
> +
> +/**
> + * panthor_perf_block_data - Identify the block index and type based on the 
> offset.
> + *
> + * @desc:   FW buffer descriptor.
> + * @offset: The current offset being examined.
> + * @idx:    Pointer to an output index.
> + * @type:   Pointer to an output block type.
> + *
> + * To disambiguate different types of blocks as well as different blocks of 
> the same type,
> + * the offset into the FW ringbuffer is used to uniquely identify the block 
> being considered.
> + *
> + * In the future, this is a good time to identify whether a block will be 
> empty,
> + * allowing us to short-circuit its processing after emitting header 
> information.
> + */
> +static void panthor_perf_block_data(struct panthor_perf_buffer_descriptor 
> *const desc,
> +             size_t offset, u32 *idx, enum drm_panthor_perf_block_type *type)
> +{
> +     unsigned long id;
> +
> +     for_each_set_bit(id, desc->available_blocks, 
> DRM_PANTHOR_PERF_BLOCK_LAST) {
> +             const size_t block_start = desc->blocks[id].offset;
> +             const size_t block_count = desc->blocks[id].block_count;
> +             const size_t block_end = desc->blocks[id].offset +
> +                     desc->block_size * block_count;
> +
> +             if (!block_count)
> +                     continue;
> +
> +             if ((offset >= block_start) && (offset < block_end)) {
> +                     *type = desc->blocks[id].type;

  [1] I think in this case 'id' will always be equal to desc->blocks[id].type,
      so maybe just assign 'id' to *type instead of reading the type field;
      the type field can then be removed altogether.

> +                     *idx = div_u64(offset - desc->blocks[id].offset, 
> desc->block_size);
> +
> +                     return;
> +             }
> +     }
> +}
> +
>  static size_t session_get_max_sample_size(const struct drm_panthor_perf_info 
> *const info)
>  {
>       const size_t block_size = 
> get_annotated_block_size(info->counters_per_block);
> @@ -358,6 +614,520 @@ static size_t session_get_max_sample_size(const struct 
> drm_panthor_perf_info *co
>       return sizeof(struct drm_panthor_perf_sample_header) + (block_size * 
> block_nr);
>  }
>  
> +static u32 panthor_perf_handle_sample(struct panthor_device *ptdev, u32 
> extract_idx, u32 insert_idx)
> +{
> +     struct panthor_perf *perf = ptdev->perf;
> +     struct panthor_perf_sampler *sampler = &ptdev->perf->sampler;
> +     const size_t ann_block_size =
> +             get_annotated_block_size(ptdev->perf_info.counters_per_block);
> +     u32 i;
> +
> +     for (i = extract_idx; i != insert_idx; i = (i + 1) % 
> sampler->sample_slots) {
> +             u8 *fw_sample = (u8 *)sampler->rb->kmap + i * 
> sampler->sample_size;
> +
> +             for (size_t fw_off = 0, ann_off = sizeof(struct 
> drm_panthor_perf_sample_header);
> +                             fw_off < sampler->desc.buffer_size;
> +                             fw_off += sampler->desc.block_size)
> +
> +             {
> +                     u32 idx;
> +                     enum drm_panthor_perf_block_type type;
> +                     DECLARE_BITMAP(expanded_em, PANTHOR_PERF_EM_BITS);
> +                     struct panthor_perf_counter_block *blk =
> +                             (typeof(blk))(perf->sampler.sample + ann_off);
> +                     const u32 prfcnt_en = 
> blk->counters[PANTHOR_CTR_PRFCNT_EN];
> +
> +                     panthor_perf_block_data(&sampler->desc, fw_off, &idx, 
> &type);
> +
> +                     /**
> +                      * TODO Data from the metadata block must be used to 
> populate the
> +                      * block state information.
> +                      */
> +                     if (type == DRM_PANTHOR_PERF_BLOCK_METADATA)
> +                             continue;
> +
> +                     expand_enable_mask(prfcnt_en, expanded_em);
> +
> +                     blk->header = (struct drm_panthor_perf_block_header) {
> +                             .clock = 0,
> +                             .block_idx = idx,
> +                             .block_type = type,
> +                             .block_states = 
> DRM_PANTHOR_PERF_BLOCK_STATE_UNKNOWN
> +                     };
> +                     bitmap_to_arr64(blk->header.enable_mask, expanded_em, 
> PANTHOR_PERF_EM_BITS);
> +
> +                     u32 *block = (u32 *)(fw_sample + fw_off);
> +
> +                     /*
> +                      * The four header counters must be treated 
> differently, because they are
> +                      * not additive. For the fourth, the assignment does 
> not matter, as it
> +                      * is reserved and should be zero.
> +                      */
> +                     blk->counters[PANTHOR_CTR_TIMESTAMP_LO] = 
> block[PANTHOR_CTR_TIMESTAMP_LO];
> +                     blk->counters[PANTHOR_CTR_TIMESTAMP_HI] = 
> block[PANTHOR_CTR_TIMESTAMP_HI];
> +                     blk->counters[PANTHOR_CTR_PRFCNT_EN] = 
> block[PANTHOR_CTR_PRFCNT_EN];
> +
> +                     for (size_t k = PANTHOR_HEADER_COUNTERS;
> +                                     k < ptdev->perf_info.counters_per_block;
> +                                     k++)
> +                             blk->counters[k] += block[k];
> +
> +                     ann_off += ann_block_size;
> +             }
> +     }
> +
> +     return i;
> +}
> +
> +static size_t panthor_perf_get_fw_reported_size(struct panthor_device *ptdev)
> +{
> +     struct panthor_fw_global_iface *glb_iface = 
> panthor_fw_get_glb_iface(ptdev);
> +
> +     size_t fw_size = GLB_PERFCNT_FW_SIZE(glb_iface->control->perfcnt_size);
> +     size_t hw_size = GLB_PERFCNT_HW_SIZE(glb_iface->control->perfcnt_size);
> +     size_t md_size = 
> PERFCNT_FEATURES_MD_SIZE(glb_iface->control->perfcnt_features);
> +
> +     return md_size + fw_size + hw_size;
> +}
> +
> +#define PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc, typ, blk_count, offset) \
> +     ({ \
> +             (desc)->blocks[(typ)].type = (typ); \
> +             (desc)->blocks[(typ)].offset = (offset); \
> +             (desc)->blocks[(typ)].block_count = (blk_count);  \
> +             if ((blk_count))                                    \
> +                     set_bit((typ), (desc)->available_blocks); \
> +             (offset) + ((desc)->block_size) * (blk_count); \
> +      })
> +
> +static int panthor_perf_setup_fw_buffer_desc(struct panthor_device *ptdev,
> +             struct panthor_perf_sampler *sampler)
> +{
> +     const struct drm_panthor_perf_info *const info = &ptdev->perf_info;
> +     const size_t block_size = info->counters_per_block * sizeof(u32);
> +     struct panthor_perf_buffer_descriptor *desc = &sampler->desc;
> +     const size_t fw_sample_size = panthor_perf_get_fw_reported_size(ptdev);
> +     size_t offset = 0;
> +
> +     desc->block_size = block_size;
> +
> +     for (enum drm_panthor_perf_block_type type = 0; type < 
> DRM_PANTHOR_PERF_BLOCK_MAX; type++) {
> +             switch (type) {
> +             case DRM_PANTHOR_PERF_BLOCK_METADATA:
> +                     if (info->flags & DRM_PANTHOR_PERF_BLOCK_STATES_SUPPORT)
> +                             offset = PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc,
> +                                     DRM_PANTHOR_PERF_BLOCK_METADATA, 1, 
> offset);
> +                     break;
> +             case DRM_PANTHOR_PERF_BLOCK_FW:
> +                     offset = PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc, type, 
> info->fw_blocks,
> +                                     offset);
> +                     break;
> +             case DRM_PANTHOR_PERF_BLOCK_CSG:
> +                     offset = PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc, type, 
> info->csg_blocks,
> +                                     offset);
> +                     break;
> +             case DRM_PANTHOR_PERF_BLOCK_CSHW:
> +                     offset = PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc, type, 
> info->cshw_blocks,
> +                                     offset);
> +                     break;
> +             case DRM_PANTHOR_PERF_BLOCK_TILER:
> +                     offset = PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc, type, 
> info->tiler_blocks,
> +                                     offset);
> +                     break;
> +             case DRM_PANTHOR_PERF_BLOCK_MEMSYS:
> +                     offset = PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc, type, 
> info->memsys_blocks,
> +                                     offset);
> +                     break;
> +             case DRM_PANTHOR_PERF_BLOCK_SHADER:
> +                     offset = PANTHOR_PERF_SET_BLOCK_DESC_DATA(desc, type, 
> info->shader_blocks,
> +                                     offset);
> +                     break;
> +             case DRM_PANTHOR_PERF_BLOCK_MAX:
> +                     drm_WARN_ON_ONCE(&ptdev->base,
> +                                     "DRM_PANTHOR_PERF_BLOCK_MAX should be 
> unreachable!");
> +                     break;
> +             }
> +     }
> +
> +     /* Computed size is not the same as the reported size, so we should not 
> proceed in
> +      * initializing the sampling session.
> +      */
> +     if (offset != fw_sample_size)
> +             return -EINVAL;
> +
> +     desc->buffer_size = offset;
> +
> +     return 0;
> +}
> +
> +static int panthor_perf_fw_stop_sampling(struct panthor_device *ptdev)
> +{
> +     struct panthor_fw_global_iface *glb_iface = 
> panthor_fw_get_glb_iface(ptdev);
> +     u32 acked;
> +     int ret;
> +
> +     if (~READ_ONCE(glb_iface->input->req) & GLB_PERFCNT_ENABLE)
> +             return 0;
> +
> +     panthor_fw_update_reqs(glb_iface, req, 0, GLB_PERFCNT_ENABLE);
> +     gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
> +     ret = panthor_fw_glb_wait_acks(ptdev, GLB_PERFCNT_ENABLE, &acked, 100);
> +     if (ret)
> +             drm_warn(&ptdev->base, "Could not disable performance 
> counters");
> +
> +     return ret;
> +}
> +
> +static int panthor_perf_fw_start_sampling(struct panthor_device *ptdev)
> +{
> +     struct panthor_fw_global_iface *glb_iface = 
> panthor_fw_get_glb_iface(ptdev);
> +     u32 acked;
> +     int ret;
> +
> +     if (READ_ONCE(glb_iface->input->req) & GLB_PERFCNT_ENABLE)
> +             return 0;
> +
> +     panthor_fw_update_reqs(glb_iface, req, GLB_PERFCNT_ENABLE, 
> GLB_PERFCNT_ENABLE);
> +     gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
> +     ret = panthor_fw_glb_wait_acks(ptdev, GLB_PERFCNT_ENABLE, &acked, 100);
> +     if (ret)
> +             drm_warn(&ptdev->base, "Could not enable performance counters");
> +
> +     return ret;
> +}
> +
> +static void panthor_perf_fw_write_em(struct panthor_perf_sampler *sampler,
> +             struct panthor_perf_enable_masks *em)
> +{
> +     struct panthor_fw_global_iface *glb_iface = 
> panthor_fw_get_glb_iface(sampler->ptdev);
> +     u32 perfcnt_config;
> +
> +     glb_iface->input->perfcnt_csf_enable =
> +             compress_enable_mask(em->mask[DRM_PANTHOR_PERF_BLOCK_CSHW]);
> +     glb_iface->input->perfcnt_shader_enable =
> +             compress_enable_mask(em->mask[DRM_PANTHOR_PERF_BLOCK_SHADER]);
> +     glb_iface->input->perfcnt_mmu_l2_enable =
> +             compress_enable_mask(em->mask[DRM_PANTHOR_PERF_BLOCK_MEMSYS]);
> +     glb_iface->input->perfcnt_tiler_enable =
> +             compress_enable_mask(em->mask[DRM_PANTHOR_PERF_BLOCK_TILER]);
> +     glb_iface->input->perfcnt_fw_enable =
> +             compress_enable_mask(em->mask[DRM_PANTHOR_PERF_BLOCK_FW]);
> +     glb_iface->input->perfcnt_csg_enable =
> +             compress_enable_mask(em->mask[DRM_PANTHOR_PERF_BLOCK_CSG]);
> +
> +     perfcnt_config = GLB_PRFCNT_CONFIG_SIZE(PANTHOR_PERF_FW_RINGBUF_SLOTS);
> +     perfcnt_config |= GLB_PRFCNT_CONFIG_SET(sampler->set_config);
> +     glb_iface->input->perfcnt_config = perfcnt_config;
> +
> +     /**
> +      * The spec mandates that the host zero the PRFCNT_EXTRACT register 
> before an enable
> +      * operation, and each (re-)enable will require an enable-disable pair 
> to program
> +      * the new changes onto the FW interface.
> +      */
> +     WRITE_ONCE(glb_iface->input->perfcnt_extract, 0);
> +}
> +
> +static void session_populate_sample_header(struct panthor_perf_session 
> *session,
> +             struct drm_panthor_perf_sample_header *hdr)
> +{
> +     hdr->block_set = 0;
> +     hdr->user_data = session->user_data;
> +     hdr->timestamp_start_ns = session->sample_start_ns;
> +     /**
> +      * TODO This should be changed to use the GPU clocks and the TIMESTAMP 
> register,
> +      * when support is added.
> +      */
> +     hdr->timestamp_end_ns = ktime_get_raw_ns();
> +}
> +
> +/**
> + * session_patch_sample - Update the PRFCNT_EN header counter and the 
> counters exposed to the
> + *                        userspace client to only contain requested 
> counters.
> + *
> + * @ptdev: Panthor device
> + * @session: Perf session
> + * @sample: Starting offset of the sample in the userspace mapping.
> + *
> + * The hardware supports counter selection at the granularity of 1 bit per 4 
> counters, and there
> + * is a single global FW frontend to program the counter requests from 
> multiple sessions. This may
> + * lead to a large disparity between the requested and provided counters for 
> an individual client.
> + * To remove this cross-talk, we patch out the counters that have not been 
> requested by this
> + * session and update the PRFCNT_EN, the header counter containing a bitmask 
> of enabled counters,
> + * accordingly.
> + */
> +static void session_patch_sample(struct panthor_device *ptdev,
> +             struct panthor_perf_session *session, u8 *sample)
> +{
> +     const struct drm_panthor_perf_info *const perf_info = &ptdev->perf_info;
> +
> +     const size_t block_size = 
> get_annotated_block_size(perf_info->counters_per_block);
> +     const size_t sample_size = session_get_max_sample_size(perf_info);
> +
> +     for (size_t i = 0; i < sample_size; i += block_size) {
> +             size_t ctr_idx;
> +             DECLARE_BITMAP(em_diff, PANTHOR_PERF_EM_BITS);
> +             struct panthor_perf_counter_block *blk = (typeof(blk))(sample + 
> block_size);
> +             enum drm_panthor_perf_block_type type = blk->header.block_type;
> +             unsigned long *blk_em = session->enabled_counters->mask[type];
> +
> +             bitmap_from_arr64(em_diff, blk->header.enable_mask, 
> PANTHOR_PERF_EM_BITS);
> +
> +             bitmap_andnot(em_diff, em_diff, blk_em, PANTHOR_PERF_EM_BITS);
> +
> +             for_each_set_bit(ctr_idx, em_diff, PANTHOR_PERF_EM_BITS)
> +                     blk->counters[ctr_idx] = 0;
> +
> +             bitmap_to_arr64(blk->header.enable_mask, blk_em, 
> PANTHOR_PERF_EM_BITS);
> +     }
> +}
> +
> +static int session_copy_sample(struct panthor_device *ptdev,
> +             struct panthor_perf_session *session)
> +{
> +     struct panthor_perf *perf = ptdev->perf;
> +     const size_t sample_size = 
> session_get_max_sample_size(&ptdev->perf_info);
> +     const u32 insert_idx = session_read_insert_idx(session);
> +     const u32 extract_idx = session_read_extract_idx(session);
> +     u8 *new_sample;
> +
> +     if (!CIRC_SPACE_TO_END(insert_idx, extract_idx, session->ringbuf_slots))
> +             return -ENOSPC;
> +
> +     new_sample = session->samples + extract_idx * sample_size;

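Wouldn't this have to be insert_idx instead of extract_idx? Since we're about
to copy the new sample into the userspace (UM) ring buffer, it should be written
to the slot at insert_idx, which is also the index this function advances
further down.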

> +
> +     memcpy(new_sample, perf->sampler.sample, sample_size);
> +
> +     session_populate_sample_header(session,
> +                     (struct drm_panthor_perf_sample_header *)new_sample);
> +
> +     session_patch_sample(ptdev, session, new_sample +
> +                     sizeof(struct drm_panthor_perf_sample_header));
> +
> +     session_write_insert_idx(session, (insert_idx + 1) % 
> session->ringbuf_slots);
> +
> +     /* Since we are about to notify userspace, we must ensure that all 
> changes to memory
> +      * are visible.
> +      */
> +     wmb();
> +
> +     eventfd_signal(session->eventfd);
> +
> +     return 0;
> +}
> +
> +#define PERFCNT_IRQS (GLB_PERFCNT_OVERFLOW | GLB_PERFCNT_SAMPLE | 
> GLB_PERFCNT_THRESHOLD)
> +
> +void panthor_perf_report_irq(struct panthor_device *ptdev, u32 status)
> +{
> +     struct panthor_perf *const perf = ptdev->perf;
> +     struct panthor_perf_sampler *sampler;
> +     struct panthor_fw_global_iface *glb_iface = 
> panthor_fw_get_glb_iface(ptdev);
> +
> +     if (!(status & JOB_INT_GLOBAL_IF))
> +             return;
> +
> +     if (!perf)
> +             return;
> +
> +     sampler = &perf->sampler;
> +
> +     /* TODO This needs locking. */
> +     const u32 ack = READ_ONCE(glb_iface->output->ack);
> +     const u32 fw_events = sampler->last_ack ^ ack;
> +
> +     sampler->last_ack = ack;
> +
> +     if (!(fw_events & PERFCNT_IRQS))
> +             return;
> +
> +     /* TODO Fix up the error handling for overflow. */
> +     if (fw_events & GLB_PERFCNT_OVERFLOW)
> +             return;
> +
> +     if (fw_events & (GLB_PERFCNT_SAMPLE | GLB_PERFCNT_THRESHOLD)) {
> +             const u32 extract_idx = 
> READ_ONCE(glb_iface->input->perfcnt_extract);
> +             const u32 insert_idx = 
> READ_ONCE(glb_iface->output->perfcnt_insert);
> +
> +             WRITE_ONCE(glb_iface->input->perfcnt_extract,
> +                             panthor_perf_handle_sample(ptdev, extract_idx, 
> insert_idx));
> +     }
> +
> +     scoped_guard(mutex, &sampler->sampler_lock)
> +     {
> +             struct list_head *pos, *temp;
> +
> +             list_for_each_safe(pos, temp, &sampler->sampler_list) {
> +                     struct panthor_perf_session *session = list_entry(pos,
> +                                     struct panthor_perf_session, waiting);
> +
> +                     session_copy_sample(ptdev, session);
> +                     list_del_init(pos);
> +
> +                     session_put(session);
> +             }
> +     }
> +
> +     memset(sampler->sample, 0, 
> session_get_max_sample_size(&ptdev->perf_info));
> +     sampler->sample_requested = false;
> +     complete(&sampler->sample_handled);
> +}
> +
> +
> +static int panthor_perf_sampler_init(struct panthor_perf_sampler *sampler,
> +             struct panthor_device *ptdev)
> +{
> +     struct panthor_fw_global_iface *glb_iface = 
> panthor_fw_get_glb_iface(ptdev);
> +     struct panthor_kernel_bo *bo;
> +     u8 *sample;
> +     int ret;
> +
> +     ret = panthor_perf_setup_fw_buffer_desc(ptdev, sampler);
> +     if (ret) {
> +             drm_err(&ptdev->base,
> +                             "Failed to setup descriptor for FW ring buffer, 
> err = %d", ret);
> +             return ret;
> +     }
> +
> +     bo = panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev),
> +                     sampler->desc.buffer_size * 
> PANTHOR_PERF_FW_RINGBUF_SLOTS,
> +                     DRM_PANTHOR_BO_NO_MMAP,
> +                     DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | 
> DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
> +                     PANTHOR_VM_KERNEL_AUTO_VA);
> +
> +     if (IS_ERR_OR_NULL(bo))
> +             return IS_ERR(bo) ? PTR_ERR(bo) : -ENOMEM;
> +
> +     ret = panthor_kernel_bo_vmap(bo);
> +     if (ret)
> +             goto cleanup_bo;
> +
> +     sample = devm_kzalloc(ptdev->base.dev,
> +                     session_get_max_sample_size(&ptdev->perf_info), 
> GFP_KERNEL);
> +     if (ZERO_OR_NULL_PTR(sample)) {
> +             ret = -ENOMEM;
> +             goto cleanup_vmap;
> +     }
> +
> +     glb_iface->input->perfcnt_as = panthor_vm_as(panthor_fw_vm(ptdev));
> +     glb_iface->input->perfcnt_base = panthor_kernel_bo_gpuva(bo);
> +     glb_iface->input->perfcnt_extract = 0;
> +     glb_iface->input->perfcnt_csg_select = 
> GENMASK(glb_iface->control->group_num, 0);
> +
> +     sampler->rb = bo;
> +     sampler->sample = sample;
> +     sampler->sample_slots = PANTHOR_PERF_FW_RINGBUF_SLOTS;
> +
> +     sampler->em = panthor_perf_em_new();
> +
> +     mutex_init(&sampler->sampler_lock);
> +     mutex_init(&sampler->config_lock);
> +     INIT_LIST_HEAD(&sampler->sampler_list);
> +     INIT_LIST_HEAD(&sampler->ems);
> +     init_completion(&sampler->sample_handled);
> +
> +     sampler->ptdev = ptdev;
> +
> +     return 0;
> +
> +cleanup_vmap:
> +     panthor_kernel_bo_vunmap(bo);
> +
> +cleanup_bo:
> +     panthor_kernel_bo_destroy(bo);
> +
> +     return ret;
> +}
> +
> +static void panthor_perf_sampler_term(struct panthor_perf_sampler *sampler)
> +{
> +     int ret;
> +
> +     if (sampler->sample_requested)
> +             wait_for_completion_killable(&sampler->sample_handled);
> +
> +     panthor_perf_fw_write_em(sampler, &(struct panthor_perf_enable_masks) 
> {});
> +
> +     ret = panthor_perf_fw_stop_sampling(sampler->ptdev);
> +     if (ret)
> +             drm_warn_once(&sampler->ptdev->base, "Sampler termination 
> failed, ret = %d", ret);
> +
> +     devm_kfree(sampler->ptdev->base.dev, sampler->sample);
> +
> +     panthor_kernel_bo_destroy(sampler->rb);
> +}
> +
> +static int panthor_perf_sampler_add(struct panthor_perf_sampler *sampler,
> +             struct panthor_perf_enable_masks *const new_em,
> +             u8 set)
> +{
> +     int ret = 0;
> +
> +     guard(mutex)(&sampler->config_lock);
> +
> +     /* Early check for whether a new set can be configured. */
> +     if (!atomic_read(&sampler->enabled_clients))
> +             sampler->set_config = set;
> +     else
> +             if (sampler->set_config != set)
> +                     return -EBUSY;
> +
> +     kref_get(&new_em->refs);
> +     list_add_tail(&sampler->ems, &new_em->link);
> +
> +     panthor_perf_em_add(sampler->em, new_em);
> +     pm_runtime_get_sync(sampler->ptdev->base.dev);
> +
> +     if (atomic_read(&sampler->enabled_clients)) {
> +             ret = panthor_perf_fw_stop_sampling(sampler->ptdev);
> +             if (ret)
> +                     return ret;
> +     }
> +
> +     panthor_perf_fw_write_em(sampler, sampler->em);
> +
> +     ret = panthor_perf_fw_start_sampling(sampler->ptdev);
> +     if (ret)
> +             return ret;
> +
> +     atomic_inc(&sampler->enabled_clients);
> +
> +     return 0;
> +}
> +
> +static int panthor_perf_sampler_remove(struct panthor_perf_sampler *sampler,
> +             struct panthor_perf_enable_masks *session_em)
> +{
> +     int ret;
> +     struct list_head *em_node;
> +
> +     guard(mutex)(&sampler->config_lock);
> +
> +     list_del_init(&session_em->link);
> +     kref_put(&session_em->refs, panthor_perf_destroy_em_kref);
> +
> +     panthor_perf_em_zero(sampler->em);
> +     list_for_each(em_node, &sampler->ems)
> +     {
> +             struct panthor_perf_enable_masks *curr_em =
> +                     container_of(em_node, typeof(*curr_em), link);
> +
> +             panthor_perf_em_add(sampler->em, curr_em);
> +     }
> +
> +     ret = panthor_perf_fw_stop_sampling(sampler->ptdev);
> +     if (ret)
> +             return ret;
> +
> +     atomic_dec(&sampler->enabled_clients);
> +     pm_runtime_put_sync(sampler->ptdev->base.dev);
> +
> +     panthor_perf_fw_write_em(sampler, sampler->em);
> +
> +     if (atomic_read(&sampler->enabled_clients))
> +             return panthor_perf_fw_start_sampling(sampler->ptdev);
> +     return 0;
> +}
> +
>  /**
>   * panthor_perf_init - Initialize the performance counter subsystem.
>   * @ptdev: Panthor device
> @@ -370,6 +1140,7 @@ static size_t session_get_max_sample_size(const struct 
> drm_panthor_perf_info *co
>  int panthor_perf_init(struct panthor_device *ptdev)
>  {
>       struct panthor_perf *perf;
> +     int ret;
>  
>       if (!ptdev)
>               return -EINVAL;
> @@ -386,12 +1157,93 @@ int panthor_perf_init(struct panthor_device *ptdev)
>               .max = 1,
>       };
>  
> +     ret = panthor_perf_sampler_init(&perf->sampler, ptdev);
> +     if (ret)
> +             goto cleanup_perf;
> +
>       drm_info(&ptdev->base, "Performance counter subsystem initialized");
>  
>       ptdev->perf = perf;
>  
> -     return 0;
> +     return ret;
> +
> +cleanup_perf:
> +     devm_kfree(ptdev->base.dev, perf);
> +
> +     return ret;
> +}
> +
> +
> +static void panthor_perf_fw_request_sample(struct panthor_perf_sampler 
> *sampler)
> +{
> +     struct panthor_fw_global_iface *glb_iface = 
> panthor_fw_get_glb_iface(sampler->ptdev);
> +
> +     panthor_fw_toggle_reqs(glb_iface, req, ack, GLB_PERFCNT_SAMPLE);
> +     gpu_write(sampler->ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1);
> +}
> +
> +/**
> + * panthor_perf_sampler_request_clearing - Request a clearing sample.
> + * @sampler: Panthor sampler
> + *
> + * Perform a synchronous sample that gets immediately discarded. This sets a 
> baseline at the point
> + * of time a new session is started, to avoid having counters from before 
> the session.
> + *
> + */
> +static int panthor_perf_sampler_request_clearing(struct panthor_perf_sampler 
> *sampler)
> +{
> +     scoped_guard(mutex, &sampler->sampler_lock) {
> +             if (!sampler->sample_requested) {
> +                     panthor_perf_fw_request_sample(sampler);
> +                     sampler->sample_requested = true;
> +             }
> +     }
> +
> +     return wait_for_completion_timeout(&sampler->sample_handled,
> +                     msecs_to_jiffies(1000));
> +}
> +
> +/**
> + * panthor_perf_sampler_request_sample - Request a counter sample for the 
> userspace client.
> + * @sampler: Panthor sampler
> + * @session: Target session
> + *
> + * A session that has already requested a sample cannot request another one 
> until the previous
> + * sample has been delivered.
> + *
> + * Return:
> + * * %0       - The sample has been requested successfully.
> + * * %-EBUSY  - The target session has already requested a sample and has 
> not received it yet.
> + */
> +static int panthor_perf_sampler_request_sample(struct panthor_perf_sampler 
> *sampler,
> +             struct panthor_perf_session *session)
> +{
> +     struct list_head *head;
> +
> +     reinit_completion(&sampler->sample_handled);
> +
> +     guard(mutex)(&sampler->sampler_lock);
> +
> +     /*
> +      * If a previous sample has not been handled yet, the session cannot 
> request another
> +      * sample. If this happens too often, the requested sample rate is too 
> high.
> +      */
> +     list_for_each(head, &sampler->sampler_list) {
> +             struct panthor_perf_session *cur_session = list_entry(head,
> +                             typeof(*cur_session), waiting);
> +
> +             if (session == cur_session)
> +                     return -EBUSY;
> +     }
> +
> +     if (list_empty(&sampler->sampler_list) && !sampler->sample_requested)
> +             panthor_perf_fw_request_sample(sampler);
>  
> +     sampler->sample_requested = true;
> +     list_add_tail(&session->waiting, &sampler->sampler_list);
> +     session_get(session);
> +
> +     return 0;
>  }
>  
>  static int session_validate_set(u8 set)
> @@ -483,7 +1335,12 @@ int panthor_perf_session_setup(struct panthor_device 
> *ptdev, struct panthor_perf
>               goto cleanup_eventfd;
>       }
>  
> +     ret = panthor_perf_sampler_add(&perf->sampler, em, 
> setup_args->block_set);
> +     if (ret)
> +             goto cleanup_em;
> +
>       INIT_LIST_HEAD(&session->waiting);
> +
>       session->extract_idx = ctrl_map.vaddr;
>       *session->extract_idx = 0;
>       session->insert_idx = session->extract_idx + 1;
> @@ -507,12 +1364,15 @@ int panthor_perf_session_setup(struct panthor_device 
> *ptdev, struct panthor_perf
>       ret = xa_alloc_cyclic(&perf->sessions, &session_id, session, 
> perf->session_range,
>                       &perf->next_session, GFP_KERNEL);
>       if (ret < 0)
> -             goto cleanup_em;
> +             goto cleanup_sampler_add;
>  
>       kref_init(&session->ref);
>  
>       return session_id;
>  
> +cleanup_sampler_add:
> +     panthor_perf_sampler_remove(&perf->sampler, em);
> +
>  cleanup_em:
>       kref_put(&em->refs, panthor_perf_destroy_em_kref);
>  
> @@ -540,6 +1400,8 @@ int panthor_perf_session_setup(struct panthor_device 
> *ptdev, struct panthor_perf
>  static int session_stop(struct panthor_perf *perf, struct 
> panthor_perf_session *session,
>               u64 user_data)
>  {
> +     int ret;
> +
>       if (!test_bit(PANTHOR_PERF_SESSION_ACTIVE, session->state))
>               return 0;
>  
> @@ -552,6 +1414,10 @@ static int session_stop(struct panthor_perf *perf, 
> struct panthor_perf_session *
>  
>       session->user_data = user_data;
>  
> +     ret = panthor_perf_sampler_request_sample(&perf->sampler, session);
> +     if (ret)
> +             return ret;
> +
>       clear_bit(PANTHOR_PERF_SESSION_ACTIVE, session->state);
>  
>       /* TODO Calls to the FW interface will go here in later patches. */
> @@ -573,8 +1439,7 @@ static int session_start(struct panthor_perf *perf, 
> struct panthor_perf_session
>       if (session->sample_freq_ns)
>               session->user_data = user_data;
>  
> -     /* TODO Calls to the FW interface will go here in later patches. */
> -     return 0;
> +     return panthor_perf_sampler_request_clearing(&perf->sampler);
>  }
>  
>  static int session_sample(struct panthor_perf *perf, struct 
> panthor_perf_session *session,
> @@ -601,15 +1466,16 @@ static int session_sample(struct panthor_perf *perf, 
> struct panthor_perf_session
>       session->sample_start_ns = ktime_get_raw_ns();
>       session->user_data = user_data;
>  
> -     /* TODO Calls to the FW interface will go here in later patches. */
> -     return 0;
> +     return panthor_perf_sampler_request_sample(&perf->sampler, session);
>  }
>  
>  static int session_destroy(struct panthor_perf *perf, struct 
> panthor_perf_session *session)
>  {
> +     int ret = panthor_perf_sampler_remove(&perf->sampler, 
> session->enabled_counters);
> +
>       session_put(session);
>  
> -     return 0;
> +     return ret;
>  }
>  
>  static int session_teardown(struct panthor_perf *perf, struct 
> panthor_perf_session *session)
> @@ -813,6 +1679,8 @@ void panthor_perf_unplug(struct panthor_device *ptdev)
>  
>       xa_destroy(&perf->sessions);
>  
> +     panthor_perf_sampler_term(&perf->sampler);
> +
>       devm_kfree(ptdev->base.dev, ptdev->perf);
>  
>       ptdev->perf = NULL;
> diff --git a/drivers/gpu/drm/panthor/panthor_perf.h 
> b/drivers/gpu/drm/panthor/panthor_perf.h
> index bfef8874068b..3485e4a55e15 100644
> --- a/drivers/gpu/drm/panthor/panthor_perf.h
> +++ b/drivers/gpu/drm/panthor/panthor_perf.h
> @@ -31,4 +31,6 @@ int panthor_perf_session_sample(struct panthor_file *pfile, 
> struct panthor_perf
>               u32 sid, u64 user_data);
>  void panthor_perf_session_destroy(struct panthor_file *pfile, struct 
> panthor_perf *perf);
>  
> +void panthor_perf_report_irq(struct panthor_device *ptdev, u32 status);
> +
>  #endif /* __PANTHOR_PERF_H__ */
> diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
> index 576d3ad46e6d..a29b755d6556 100644
> --- a/include/uapi/drm/panthor_drm.h
> +++ b/include/uapi/drm/panthor_drm.h
> @@ -441,8 +441,11 @@ enum drm_panthor_perf_feat_flags {
>   * enum drm_panthor_perf_block_type - Performance counter supported block 
> types.
>   */
>  enum drm_panthor_perf_block_type {
> +     /** DRM_PANTHOR_PERF_BLOCK_METADATA: Internal use only. */
> +     DRM_PANTHOR_PERF_BLOCK_METADATA = 0,
> +
>       /** @DRM_PANTHOR_PERF_BLOCK_FW: The FW counter block. */
> -     DRM_PANTHOR_PERF_BLOCK_FW = 1,
> +     DRM_PANTHOR_PERF_BLOCK_FW,
>  
>       /** @DRM_PANTHOR_PERF_BLOCK_CSG: A CSG counter block. */
>       DRM_PANTHOR_PERF_BLOCK_CSG,
> -- 
> 2.25.1


Adrian Larumbe

Reply via email to