On 2026-06-09 14:14, Eric Huang wrote:
>
> On 2026-06-09 12:43, Harish Kasiviswanathan wrote:
>> There should be a CP FW version check that supports these registers.
> The only difference for CPFW supporting save/restore the registers is no
> accumulated value per-queue after unmap/remap queue, the utilization register
> always updates current value+activity counter by HW, so it is not fully
> necessary in my point of view. If the check is needed, it won't return
> anything wrong, and just leaves a warning like "No accumulated counter
> supported!", what do you think?
[HK]: I think printing values that look correct but is wrong is very
misleading. If CP FW doesn't support it, sdma_utilization should report either
0 or -1 along with a message in the kernel log. This should be case for 9.4.4
and 9.5 until CP FW fixes for these ASICs. Apart from that everything else
looks fine.
>> Some comments inline.
>>
>>
>> On 2026-06-04 13:45, Eric Huang wrote:
>>> since gfx 9.4.3 HW is calculating accumulated activity counter
>>> per-queue in register sdmax_rlcx_utilization_hi/lo, CPFW adds it in
>>> sdma MQD for save/restore, KFD will read it from there. gfx 9.4.2
>>> will still keep the way to read from memory at rptr+8.
>>>
>>> v2: read dynamic counter directly from utilization register
>>>
>>> Signed-off-by: Eric Huang <[email protected]>
>>> ---
>>> .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 51 ++++++++++++++++++-
>>> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 23 +++++++--
>>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 13 ++++-
>>> .../include/asic_reg/sdma/sdma_4_4_2_offset.h | 4 ++
>>> .../gpu/drm/amd/include/kgd_kfd_interface.h | 2 +
>>> drivers/gpu/drm/amd/include/v9_structs.h | 4 +-
>>> 6 files changed, 89 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
>>> index f46c59118304..16bad244c091 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
>>> @@ -35,6 +35,8 @@
>>> #include "sdma/sdma_4_4_2_sh_mask.h"
>>> #include <uapi/linux/kfd_ioctl.h>
>>> +#define SDMA_QUEUES_NUM_PER_ENG 8
>>> +
>>> static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
>>> {
>>> return (struct v9_sdma_mqd *)mqd;
>>> @@ -584,6 +586,52 @@ static uint32_t kgd_v9_4_3_ptl_ctrl(struct
>>> amdgpu_device *adev,
>>> ptl_state, fmt1, fmt2);
>>> }
>>> +static int kgd_gfx_v9_4_3_hqd_sdma_get_counter(struct amdgpu_device
>>> *adev,
>>> + void *mqd, uint64_t *val)
>>> +{
>>> + struct v9_sdma_mqd *m = get_sdma_mqd(mqd);
>>> + uint32_t sdma_rlc_reg_offset;
>>> + uint32_t sdma_rlc_rb_cntl;
>>> + uint32_t engine_id, queue_id;
>>> + uint32_t engines = adev->sdma.num_instances;
>>> + uint32_t sdma_rlcx_rb_base, sdma_rlcx_rb_base_hi;
>>> + bool found = false;
>>> +
>>> + if (!m)
>>> + return -EINVAL;
>>> +
>>> + for (engine_id = 0; engine_id < engines && !found; engine_id++) {
>>> + for (queue_id = 0; queue_id < SDMA_QUEUES_NUM_PER_ENG; queue_id++)
>>> {
>>> + sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
>>> + engine_id, queue_id);
>> [HK]: sdma_rlc_reg_offset read could be move inside the if condition. Saves
>> unncessary register read here.
> It won't work, sdma_rlc_reg_offset is needed in the next line for reading
> sdm_rlcx_rb_base from register.
>>
>>> + sdma_rlcx_rb_base = RREG32(sdma_rlc_reg_offset +
>>> + regSDMA_RLC0_RB_BASE);
>>> + sdma_rlcx_rb_base_hi = RREG32(sdma_rlc_reg_offset +
>>> + regSDMA_RLC0_RB_BASE_HI);
>>> +
>>> + if (m->sdmax_rlcx_rb_base == sdma_rlcx_rb_base &&
>>> + m->sdmax_rlcx_rb_base_hi == sdma_rlcx_rb_base_hi) {
>>> + found = true;
>>> + break;
>>> + }
>>> + }
>>> + }
>>> +
>> [HK]: Needs a if(!found) error handling.
> Because SDMA doesn't support oversubscription, there must be a HQD associated
> with a MQD, found must be true here. I probably need to add the comment in
> finding loop for that.
>>
>>
>>> + sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
>>> +
>>> + /* Read sdma activity counter from utilization register
>>> + * if hw queue is enabled, otherwise read from MQD.
>>> + */
>>> + if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK)
>>> + *val = (uint64_t)RREG32(sdma_rlc_reg_offset +
>>> regSDMA_RLC0_UTILIZATION_HI) << 32 |
>>> + RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_UTILIZATION_LO);
>>> + else
>>> + *val = (uint64_t)m->sdmax_rlcx_utilization_hi << 32 |
>>> + m->sdmax_rlcx_utilization_lo;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>>> .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>>> .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping,
>>> @@ -623,5 +671,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>>> .trigger_pc_sample_trap = kgd_v9_4_3_trigger_pc_sample_trap,
>>> .override_core_cg = kgd_gfx_v9_4_3_override_core_cg,
>>> .setup_stoch_sampling = kgd_v9_4_3_setup_stoch_sampling,
>>> - .ptl_ctrl = kgd_v9_4_3_ptl_ctrl
>>> + .ptl_ctrl = kgd_v9_4_3_ptl_ctrl,
>>> + .hqd_sdma_get_counter = kgd_gfx_v9_4_3_hqd_sdma_get_counter
>>> };
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index b934863312d0..a65161659f74 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -1067,8 +1067,15 @@ static int destroy_queue_nocpsch(struct
>>> device_queue_manager *dqm,
>>> /* Get the SDMA queue stats */
>>> if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>> (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>> - retval = read_sdma_queue_counter((uint64_t __user
>>> *)q->properties.read_ptr,
>>> - &sdma_val);
>>> + if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2)))
>>> + retval = read_sdma_queue_counter(
>>> + (uint64_t __user *)q->properties.read_ptr,
>>> + &sdma_val);
>>> + else
>>> + retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
>>> + dqm->dev->kfd2kgd->hqd_sdma_get_counter(
>>> + dqm->dev->adev, q->mqd, &sdma_val) :
>>> + 0;
>> [HK]: What ahout 9.4.4 and 9.5? Do we we support those now? Otherwise, it
>> will silently report 0 and not error.
> gc_9_4_3_kfd2kgd funcs are shared with gfx 9.4.4 and 9.5. The only thing I
> think is return -ENOTSUP instead of 0, if no function is found here. what do
> you think?
>>
>>> if (retval)
>>> dev_err(dev, "Failed to read SDMA queue counter for queue:
>>> %d\n",
>>> q->properties.queue_id);
>>> @@ -2728,8 +2735,16 @@ static int destroy_queue_cpsch(struct
>>> device_queue_manager *dqm,
>>> /* Get the SDMA queue stats */
>>> if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>> (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>> - retval = read_sdma_queue_counter((uint64_t __user
>>> *)q->properties.read_ptr,
>>> - &sdma_val);
>>> + if (KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2))
>>> + retval = read_sdma_queue_counter(
>>> + (uint64_t __user *)q->properties.read_ptr,
>>> + &sdma_val);
>>> + else
>>> + retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
>>> + dqm->dev->kfd2kgd->hqd_sdma_get_counter(
>>> + dqm->dev->adev, q->mqd, &sdma_val) :
>>> + 0;
>>> +
>>> if (retval)
>>> dev_err(dev, "Failed to read SDMA queue counter for queue:
>>> %d\n",
>>> q->properties.queue_id);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 0be2fd04e6d0..911f974e6bf5 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -95,6 +95,7 @@ struct kfd_sdma_activity_handler_workarea {
>>> struct temp_sdma_queue_list {
>>> uint64_t __user *rptr;
>>> + void *mqd;
>>> uint64_t sdma_val;
>>> unsigned int queue_id;
>>> struct list_head list;
>>> @@ -165,6 +166,7 @@ static void kfd_sdma_activity_worker(struct work_struct
>>> *work)
>>> INIT_LIST_HEAD(&sdma_q->list);
>>> sdma_q->rptr = (uint64_t __user *)q->properties.read_ptr;
>>> + sdma_q->mqd = q->mqd;
>>> sdma_q->queue_id = q->properties.queue_id;
>>> list_add_tail(&sdma_q->list, &sdma_q_list.list);
>>> }
>>> @@ -193,7 +195,16 @@ static void kfd_sdma_activity_worker(struct
>>> work_struct *work)
>>> list_for_each_entry(sdma_q, &sdma_q_list.list, list) {
>>> val = 0;
>>> - ret = read_sdma_queue_counter(sdma_q->rptr, &val);
>>> +
>>> + if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2)))
>>> + ret = read_sdma_queue_counter(sdma_q->rptr, &val);
>>> + else
>>> + ret = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
>>> + dqm->dev->kfd2kgd->hqd_sdma_get_counter(
>>> + dqm->dev->adev,
>>> + sdma_q->mqd, &val) :
>>> + 0;
>>> +
>>> if (ret) {
>>> pr_debug("Failed to read SDMA queue active counter for queue
>>> id: %d",
>>> sdma_q->queue_id);
>>> diff --git a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
>>> b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
>>> index ead81aeffd67..8700f8190c7c 100644
>>> --- a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
>>> +++ b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
>>> @@ -493,6 +493,10 @@
>>> #define regSDMA_RLC0_MIDCMD_DATA10_BASE_IDX
>>> 0
>>> #define regSDMA_RLC0_MIDCMD_CNTL
>>> 0x017b
>>> #define regSDMA_RLC0_MIDCMD_CNTL_BASE_IDX
>>> 0
>>> +#define regSDMA_RLC0_UTILIZATION_LO
>>> 0x017c
>>> +#define regSDMA_RLC0_UTILIZATION_LO_BASE_IDX
>>> 0
>>> +#define regSDMA_RLC0_UTILIZATION_HI
>>> 0x017d
>>> +#define regSDMA_RLC0_UTILIZATION_HI_BASE_IDX
>>> 0
>>> #define regSDMA_RLC1_RB_CNTL
>>> 0x018
>> [HK]: Formatting error. You need to use space instead of tabs, I guess.
> Sure.
>>
>>
>> 8
>>> #define regSDMA_RLC1_RB_CNTL_BASE_IDX
>>> 0
>>> #define regSDMA_RLC1_RB_BASE
>>> 0x0189
>>> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>> b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>> index d34c869b182f..f3220794c108 100644
>>> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>> @@ -361,6 +361,8 @@ struct kfd2kgd_calls {
>>> uint32_t *ptl_state,
>>> enum amdgpu_ptl_fmt *fmt1,
>>> enum amdgpu_ptl_fmt *fmt2);
>>> + int (*hqd_sdma_get_counter)(struct amdgpu_device *adev,
>>> + void *mqd, uint64_t *val);
>> [HK]: #define SDMA_QUEUES_NUM_PER_ENG 8 <-- We already hard code this
>> value in KFD. I think #define could be avoided if you pass in
>> number_of_sdma_queues_per_engine as a parameter.
> OK.
>
> Regards,
> Eric
>>
>>
>>> };
>>> #endif /* KGD_KFD_INTERFACE_H_INCLUDED */
>>> diff --git a/drivers/gpu/drm/amd/include/v9_structs.h
>>> b/drivers/gpu/drm/amd/include/v9_structs.h
>>> index a2f81b9c38af..e0d387f08576 100644
>>> --- a/drivers/gpu/drm/amd/include/v9_structs.h
>>> +++ b/drivers/gpu/drm/amd/include/v9_structs.h
>>> @@ -69,8 +69,8 @@ struct v9_sdma_mqd {
>>> uint32_t sdmax_rlcx_midcmd_cntl;
>>> uint32_t reserved_42;
>>> uint32_t reserved_43;
>>> - uint32_t reserved_44;
>>> - uint32_t reserved_45;
>>> + uint32_t sdmax_rlcx_utilization_lo;
>>> + uint32_t sdmax_rlcx_utilization_hi;
>>> uint32_t reserved_46;
>>> uint32_t reserved_47;
>>> uint32_t reserved_48;
>