There should be a CP FW version check that supports these registers. Some comments inline.
On 2026-06-04 13:45, Eric Huang wrote: > since gfx 9.4.3 HW is calculating accumulated activity counter > per-queue in register sdmax_rlcx_utilization_hi/lo, CPFW adds it in > sdma MQD for save/restore, KFD will read it from there. gfx 9.4.2 > will still keep the way to read from memory at rptr+8. > > v2: read dynamic counter directly from utilization register > > Signed-off-by: Eric Huang <[email protected]> > --- > .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 51 ++++++++++++++++++- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 23 +++++++-- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 13 ++++- > .../include/asic_reg/sdma/sdma_4_4_2_offset.h | 4 ++ > .../gpu/drm/amd/include/kgd_kfd_interface.h | 2 + > drivers/gpu/drm/amd/include/v9_structs.h | 4 +- > 6 files changed, 89 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > index f46c59118304..16bad244c091 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > @@ -35,6 +35,8 @@ > #include "sdma/sdma_4_4_2_sh_mask.h" > #include <uapi/linux/kfd_ioctl.h> > > +#define SDMA_QUEUES_NUM_PER_ENG 8 > + > static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) > { > return (struct v9_sdma_mqd *)mqd; > @@ -584,6 +586,52 @@ static uint32_t kgd_v9_4_3_ptl_ctrl(struct amdgpu_device > *adev, > ptl_state, fmt1, fmt2); > } > > +static int kgd_gfx_v9_4_3_hqd_sdma_get_counter(struct amdgpu_device *adev, > + void *mqd, uint64_t *val) > +{ > + struct v9_sdma_mqd *m = get_sdma_mqd(mqd); > + uint32_t sdma_rlc_reg_offset; > + uint32_t sdma_rlc_rb_cntl; > + uint32_t engine_id, queue_id; > + uint32_t engines = adev->sdma.num_instances; > + uint32_t sdma_rlcx_rb_base, sdma_rlcx_rb_base_hi; > + bool found = false; > + > + if (!m) > + return -EINVAL; > + > + for (engine_id = 0; engine_id < engines && !found; engine_id++) { > + for (queue_id = 0; queue_id < SDMA_QUEUES_NUM_PER_ENG; > queue_id++) { > + sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, > + engine_id, queue_id); [HK]: sdma_rlc_reg_offset read could be move inside the if condition. Saves unncessary register read here. > + sdma_rlcx_rb_base = RREG32(sdma_rlc_reg_offset + > + regSDMA_RLC0_RB_BASE); > + sdma_rlcx_rb_base_hi = RREG32(sdma_rlc_reg_offset + > + regSDMA_RLC0_RB_BASE_HI); > + > + if (m->sdmax_rlcx_rb_base == sdma_rlcx_rb_base && > + m->sdmax_rlcx_rb_base_hi == sdma_rlcx_rb_base_hi) { > + found = true; > + break; > + } > + } > + } > + [HK]: Needs a if(!found) error handling. > + sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL); > + > + /* Read sdma activity counter from utilization register > + * if hw queue is enabled, otherwise read from MQD. > + */ > + if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK) > + *val = (uint64_t)RREG32(sdma_rlc_reg_offset + > regSDMA_RLC0_UTILIZATION_HI) << 32 | > + RREG32(sdma_rlc_reg_offset + > regSDMA_RLC0_UTILIZATION_LO); > + else > + *val = (uint64_t)m->sdmax_rlcx_utilization_hi << 32 | > + m->sdmax_rlcx_utilization_lo; > + > + return 0; > +} > + > const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { > .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, > .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping, > @@ -623,5 +671,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { > .trigger_pc_sample_trap = kgd_v9_4_3_trigger_pc_sample_trap, > .override_core_cg = kgd_gfx_v9_4_3_override_core_cg, > .setup_stoch_sampling = kgd_v9_4_3_setup_stoch_sampling, > - .ptl_ctrl = kgd_v9_4_3_ptl_ctrl > + .ptl_ctrl = kgd_v9_4_3_ptl_ctrl, > + .hqd_sdma_get_counter = kgd_gfx_v9_4_3_hqd_sdma_get_counter > }; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index b934863312d0..a65161659f74 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1067,8 +1067,15 @@ static int destroy_queue_nocpsch(struct > device_queue_manager *dqm, > /* Get the SDMA queue stats */ > if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || > (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { > - retval = read_sdma_queue_counter((uint64_t __user > *)q->properties.read_ptr, > - &sdma_val); > + if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2))) > + retval = read_sdma_queue_counter( > + (uint64_t __user > *)q->properties.read_ptr, > + &sdma_val); > + else > + retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ? > + dqm->dev->kfd2kgd->hqd_sdma_get_counter( > + dqm->dev->adev, q->mqd, &sdma_val) : > + 0; [HK]: What ahout 9.4.4 and 9.5? Do we we support those now? Otherwise, it will silently report 0 and not error. > if (retval) > dev_err(dev, "Failed to read SDMA queue counter for > queue: %d\n", > q->properties.queue_id); > @@ -2728,8 +2735,16 @@ static int destroy_queue_cpsch(struct > device_queue_manager *dqm, > /* Get the SDMA queue stats */ > if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || > (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { > - retval = read_sdma_queue_counter((uint64_t __user > *)q->properties.read_ptr, > - &sdma_val); > + if (KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2)) > + retval = read_sdma_queue_counter( > + (uint64_t __user > *)q->properties.read_ptr, > + &sdma_val); > + else > + retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ? > + dqm->dev->kfd2kgd->hqd_sdma_get_counter( > + dqm->dev->adev, q->mqd, &sdma_val) : > + 0; > + > if (retval) > dev_err(dev, "Failed to read SDMA queue counter for > queue: %d\n", > q->properties.queue_id); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 0be2fd04e6d0..911f974e6bf5 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -95,6 +95,7 @@ struct kfd_sdma_activity_handler_workarea { > > struct temp_sdma_queue_list { > uint64_t __user *rptr; > + void *mqd; > uint64_t sdma_val; > unsigned int queue_id; > struct list_head list; > @@ -165,6 +166,7 @@ static void kfd_sdma_activity_worker(struct work_struct > *work) > > INIT_LIST_HEAD(&sdma_q->list); > sdma_q->rptr = (uint64_t __user *)q->properties.read_ptr; > + sdma_q->mqd = q->mqd; > sdma_q->queue_id = q->properties.queue_id; > list_add_tail(&sdma_q->list, &sdma_q_list.list); > } > @@ -193,7 +195,16 @@ static void kfd_sdma_activity_worker(struct work_struct > *work) > > list_for_each_entry(sdma_q, &sdma_q_list.list, list) { > val = 0; > - ret = read_sdma_queue_counter(sdma_q->rptr, &val); > + > + if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2))) > + ret = read_sdma_queue_counter(sdma_q->rptr, &val); > + else > + ret = dqm->dev->kfd2kgd->hqd_sdma_get_counter ? > + dqm->dev->kfd2kgd->hqd_sdma_get_counter( > + dqm->dev->adev, > + sdma_q->mqd, &val) : > + 0; > + > if (ret) { > pr_debug("Failed to read SDMA queue active counter for > queue id: %d", > sdma_q->queue_id); > diff --git a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h > b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h > index ead81aeffd67..8700f8190c7c 100644 > --- a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h > +++ b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h > @@ -493,6 +493,10 @@ > #define regSDMA_RLC0_MIDCMD_DATA10_BASE_IDX > 0 > #define regSDMA_RLC0_MIDCMD_CNTL > 0x017b > #define regSDMA_RLC0_MIDCMD_CNTL_BASE_IDX > 0 > +#define regSDMA_RLC0_UTILIZATION_LO > 0x017c > +#define regSDMA_RLC0_UTILIZATION_LO_BASE_IDX > 0 > +#define regSDMA_RLC0_UTILIZATION_HI > 0x017d > +#define regSDMA_RLC0_UTILIZATION_HI_BASE_IDX > 0 > #define regSDMA_RLC1_RB_CNTL > 0x018 [HK]: Formatting error. You need to use space instead of tabs, I guess. 8 > #define regSDMA_RLC1_RB_CNTL_BASE_IDX > 0 > #define regSDMA_RLC1_RB_BASE > 0x0189 > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > index d34c869b182f..f3220794c108 100644 > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > @@ -361,6 +361,8 @@ struct kfd2kgd_calls { > uint32_t *ptl_state, > enum amdgpu_ptl_fmt *fmt1, > enum amdgpu_ptl_fmt *fmt2); > + int (*hqd_sdma_get_counter)(struct amdgpu_device *adev, > + void *mqd, uint64_t *val); [HK]: #define SDMA_QUEUES_NUM_PER_ENG 8 <-- We already hard code this value in KFD. I think #define could be avoided if you pass in number_of_sdma_queues_per_engine as a parameter. > }; > > #endif /* KGD_KFD_INTERFACE_H_INCLUDED */ > diff --git a/drivers/gpu/drm/amd/include/v9_structs.h > b/drivers/gpu/drm/amd/include/v9_structs.h > index a2f81b9c38af..e0d387f08576 100644 > --- a/drivers/gpu/drm/amd/include/v9_structs.h > +++ b/drivers/gpu/drm/amd/include/v9_structs.h > @@ -69,8 +69,8 @@ struct v9_sdma_mqd { > uint32_t sdmax_rlcx_midcmd_cntl; > uint32_t reserved_42; > uint32_t reserved_43; > - uint32_t reserved_44; > - uint32_t reserved_45; > + uint32_t sdmax_rlcx_utilization_lo; > + uint32_t sdmax_rlcx_utilization_hi; > uint32_t reserved_46; > uint32_t reserved_47; > uint32_t reserved_48;
