There should be a check that the CP FW version supports these registers.
Some comments inline.


On 2026-06-04 13:45, Eric Huang wrote:
> since gfx 9.4.3 HW is calculating accumulated activity counter
> per-queue in register sdmax_rlcx_utilization_hi/lo, CPFW adds it in
> sdma MQD for save/restore, KFD will read it from there. gfx 9.4.2
> will still keep the way to read from memory at rptr+8.
> 
> v2: read dynamic counter directly from utilization register
> 
> Signed-off-by: Eric Huang <[email protected]>
> ---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   | 51 ++++++++++++++++++-
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 23 +++++++--
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 13 ++++-
>  .../include/asic_reg/sdma/sdma_4_4_2_offset.h |  4 ++
>  .../gpu/drm/amd/include/kgd_kfd_interface.h   |  2 +
>  drivers/gpu/drm/amd/include/v9_structs.h      |  4 +-
>  6 files changed, 89 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> index f46c59118304..16bad244c091 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
> @@ -35,6 +35,8 @@
>  #include "sdma/sdma_4_4_2_sh_mask.h"
>  #include <uapi/linux/kfd_ioctl.h>
>  
> +#define SDMA_QUEUES_NUM_PER_ENG      8
> +
>  static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
>  {
>       return (struct v9_sdma_mqd *)mqd;
> @@ -584,6 +586,52 @@ static uint32_t kgd_v9_4_3_ptl_ctrl(struct amdgpu_device 
> *adev,
>                       ptl_state, fmt1, fmt2);
>  }
>  
> +static int kgd_gfx_v9_4_3_hqd_sdma_get_counter(struct amdgpu_device *adev,
> +                                     void *mqd, uint64_t *val)
> +{
> +     struct v9_sdma_mqd *m = get_sdma_mqd(mqd);
> +     uint32_t sdma_rlc_reg_offset;
> +     uint32_t sdma_rlc_rb_cntl;
> +     uint32_t engine_id, queue_id;
> +     uint32_t engines = adev->sdma.num_instances;
> +     uint32_t sdma_rlcx_rb_base, sdma_rlcx_rb_base_hi;
> +     bool found = false;
> +
> +     if (!m)
> +             return -EINVAL;
> +
> +     for (engine_id = 0; engine_id < engines && !found; engine_id++) {
> +             for (queue_id = 0; queue_id < SDMA_QUEUES_NUM_PER_ENG; 
> queue_id++) {
> +                     sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
> +                                             engine_id, queue_id);
[HK]: The sdma_rlc_reg_offset read could be moved inside the if condition.
That saves an unnecessary register read here.

> +                     sdma_rlcx_rb_base = RREG32(sdma_rlc_reg_offset +
> +                                             regSDMA_RLC0_RB_BASE);
> +                     sdma_rlcx_rb_base_hi = RREG32(sdma_rlc_reg_offset +
> +                                             regSDMA_RLC0_RB_BASE_HI);
> +
> +                     if (m->sdmax_rlcx_rb_base == sdma_rlcx_rb_base &&
> +                         m->sdmax_rlcx_rb_base_hi == sdma_rlcx_rb_base_hi) {
> +                             found = true;
> +                             break;
> +                     }
> +             }
> +     }
> +

[HK]: This needs if (!found) error handling. Otherwise, when no queue matches,
the reads below use the offset of the last queue scanned.


> +     sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
> +
> +     /* Read sdma activity counter from utilization register
> +      * if hw queue is enabled, otherwise read from MQD.
> +      */
> +     if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK)
> +             *val = (uint64_t)RREG32(sdma_rlc_reg_offset + 
> regSDMA_RLC0_UTILIZATION_HI) << 32 |
> +                     RREG32(sdma_rlc_reg_offset + 
> regSDMA_RLC0_UTILIZATION_LO);
> +     else
> +             *val = (uint64_t)m->sdmax_rlcx_utilization_hi << 32 |
> +                     m->sdmax_rlcx_utilization_lo;
> +
> +     return 0;
> +}
> +
>  const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>       .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>       .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping,
> @@ -623,5 +671,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>       .trigger_pc_sample_trap = kgd_v9_4_3_trigger_pc_sample_trap,
>       .override_core_cg = kgd_gfx_v9_4_3_override_core_cg,
>       .setup_stoch_sampling = kgd_v9_4_3_setup_stoch_sampling,
> -     .ptl_ctrl = kgd_v9_4_3_ptl_ctrl
> +     .ptl_ctrl = kgd_v9_4_3_ptl_ctrl,
> +     .hqd_sdma_get_counter = kgd_gfx_v9_4_3_hqd_sdma_get_counter
>  };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index b934863312d0..a65161659f74 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1067,8 +1067,15 @@ static int destroy_queue_nocpsch(struct 
> device_queue_manager *dqm,
>       /* Get the SDMA queue stats */
>       if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>           (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> -             retval = read_sdma_queue_counter((uint64_t __user 
> *)q->properties.read_ptr,
> -                                                     &sdma_val);
> +             if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2)))
> +                     retval = read_sdma_queue_counter(
> +                                     (uint64_t __user 
> *)q->properties.read_ptr,
> +                                     &sdma_val);
> +             else
> +                     retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
> +                              dqm->dev->kfd2kgd->hqd_sdma_get_counter(
> +                                     dqm->dev->adev, q->mqd, &sdma_val) :
> +                              0;
[HK]: What about 9.4.4 and 9.5? Do we support those now? Otherwise, it will
silently report 0 instead of returning an error.

>               if (retval)
>                       dev_err(dev, "Failed to read SDMA queue counter for 
> queue: %d\n",
>                               q->properties.queue_id);
> @@ -2728,8 +2735,16 @@ static int destroy_queue_cpsch(struct 
> device_queue_manager *dqm,
>       /* Get the SDMA queue stats */
>       if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>           (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
> -             retval = read_sdma_queue_counter((uint64_t __user 
> *)q->properties.read_ptr,
> -                                                     &sdma_val);
> +             if (KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2))
> +                     retval = read_sdma_queue_counter(
> +                                     (uint64_t __user 
> *)q->properties.read_ptr,
> +                                     &sdma_val);
> +             else
> +                     retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
> +                              dqm->dev->kfd2kgd->hqd_sdma_get_counter(
> +                                     dqm->dev->adev, q->mqd, &sdma_val) :
> +                              0;
> +
>               if (retval)
>                       dev_err(dev, "Failed to read SDMA queue counter for 
> queue: %d\n",
>                               q->properties.queue_id);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 0be2fd04e6d0..911f974e6bf5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -95,6 +95,7 @@ struct kfd_sdma_activity_handler_workarea {
>  
>  struct temp_sdma_queue_list {
>       uint64_t __user *rptr;
> +     void *mqd;
>       uint64_t sdma_val;
>       unsigned int queue_id;
>       struct list_head list;
> @@ -165,6 +166,7 @@ static void kfd_sdma_activity_worker(struct work_struct 
> *work)
>  
>               INIT_LIST_HEAD(&sdma_q->list);
>               sdma_q->rptr = (uint64_t __user *)q->properties.read_ptr;
> +             sdma_q->mqd = q->mqd;
>               sdma_q->queue_id = q->properties.queue_id;
>               list_add_tail(&sdma_q->list, &sdma_q_list.list);
>       }
> @@ -193,7 +195,16 @@ static void kfd_sdma_activity_worker(struct work_struct 
> *work)
>  
>       list_for_each_entry(sdma_q, &sdma_q_list.list, list) {
>               val = 0;
> -             ret = read_sdma_queue_counter(sdma_q->rptr, &val);
> +
> +             if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2)))
> +                     ret = read_sdma_queue_counter(sdma_q->rptr, &val);
> +             else
> +                     ret = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
> +                           dqm->dev->kfd2kgd->hqd_sdma_get_counter(
> +                                     dqm->dev->adev,
> +                                     sdma_q->mqd, &val) :
> +                           0;
> +
>               if (ret) {
>                       pr_debug("Failed to read SDMA queue active counter for 
> queue id: %d",
>                                sdma_q->queue_id);
> diff --git a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h 
> b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
> index ead81aeffd67..8700f8190c7c 100644
> --- a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
> +++ b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
> @@ -493,6 +493,10 @@
>  #define regSDMA_RLC0_MIDCMD_DATA10_BASE_IDX                                  
>                            0
>  #define regSDMA_RLC0_MIDCMD_CNTL                                             
>                            0x017b
>  #define regSDMA_RLC0_MIDCMD_CNTL_BASE_IDX                                    
>                            0
> +#define regSDMA_RLC0_UTILIZATION_LO                                          
>                         0x017c
> +#define regSDMA_RLC0_UTILIZATION_LO_BASE_IDX                                 
>                         0
> +#define regSDMA_RLC0_UTILIZATION_HI                                          
>                         0x017d
> +#define regSDMA_RLC0_UTILIZATION_HI_BASE_IDX                                 
>                         0
>  #define regSDMA_RLC1_RB_CNTL                                                 
>                            0x0188
[HK]: Formatting error. You need to use spaces instead of tabs, I guess.

>  #define regSDMA_RLC1_RB_CNTL_BASE_IDX                                        
>                            0
>  #define regSDMA_RLC1_RB_BASE                                                 
>                            0x0189
> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
> b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index d34c869b182f..f3220794c108 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -361,6 +361,8 @@ struct kfd2kgd_calls {
>                            uint32_t *ptl_state,
>                            enum amdgpu_ptl_fmt *fmt1,
>                            enum amdgpu_ptl_fmt *fmt2);
> +     int (*hqd_sdma_get_counter)(struct amdgpu_device *adev,
> +                                 void *mqd, uint64_t *val);
[HK]: #define SDMA_QUEUES_NUM_PER_ENG   8 <-- We already hard-code this value
in KFD. I think the #define could be avoided if you pass in
number_of_sdma_queues_per_engine as a parameter.


>  };
>  
>  #endif       /* KGD_KFD_INTERFACE_H_INCLUDED */
> diff --git a/drivers/gpu/drm/amd/include/v9_structs.h 
> b/drivers/gpu/drm/amd/include/v9_structs.h
> index a2f81b9c38af..e0d387f08576 100644
> --- a/drivers/gpu/drm/amd/include/v9_structs.h
> +++ b/drivers/gpu/drm/amd/include/v9_structs.h
> @@ -69,8 +69,8 @@ struct v9_sdma_mqd {
>       uint32_t sdmax_rlcx_midcmd_cntl;
>       uint32_t reserved_42;
>       uint32_t reserved_43;
> -     uint32_t reserved_44;
> -     uint32_t reserved_45;
> +     uint32_t sdmax_rlcx_utilization_lo;
> +     uint32_t sdmax_rlcx_utilization_hi;
>       uint32_t reserved_46;
>       uint32_t reserved_47;
>       uint32_t reserved_48;

Reply via email to