On 2026-06-09 14:14, Eric Huang wrote:
> 
> On 2026-06-09 12:43, Harish Kasiviswanathan wrote:
>> There should be a CP FW version check that supports these registers.
> The only difference for CPFW supporting save/restore the registers is no 
> accumulated value per-queue after unmap/remap queue, the utilization register 
>  always updates current value+activity counter by HW, so it is not fully 
> necessary in my point of view. If the check is needed, it won't return 
> anything wrong, and just leaves a warning like "No accumulated counter 
> supported!", what do you think?

[HK]: I think printing values that look correct but is wrong is very 
misleading. If CP FW doesn't support it, sdma_utilization should report either 
0 or -1 along with a message in the kernel log. This should be case for 9.4.4 
and 9.5 until CP FW fixes for these ASICs. Apart from that everything else 
looks fine.

>> Some comments inline.
>>
>>
>> On 2026-06-04 13:45, Eric Huang wrote:
>>> since gfx 9.4.3 HW is calculating accumulated activity counter
>>> per-queue in register sdmax_rlcx_utilization_hi/lo, CPFW adds it in
>>> sdma MQD for save/restore, KFD will read it from there. gfx 9.4.2
>>> will still keep the way to read from memory at rptr+8.
>>>
>>> v2: read dynamic counter directly from utilization register
>>>
>>> Signed-off-by: Eric Huang <[email protected]>
>>> ---
>>>   .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   | 51 ++++++++++++++++++-
>>>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 23 +++++++--
>>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 13 ++++-
>>>   .../include/asic_reg/sdma/sdma_4_4_2_offset.h |  4 ++
>>>   .../gpu/drm/amd/include/kgd_kfd_interface.h   |  2 +
>>>   drivers/gpu/drm/amd/include/v9_structs.h      |  4 +-
>>>   6 files changed, 89 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
>>> index f46c59118304..16bad244c091 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
>>> @@ -35,6 +35,8 @@
>>>   #include "sdma/sdma_4_4_2_sh_mask.h"
>>>   #include <uapi/linux/kfd_ioctl.h>
>>>   +#define SDMA_QUEUES_NUM_PER_ENG    8
>>> +
>>>   static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
>>>   {
>>>       return (struct v9_sdma_mqd *)mqd;
>>> @@ -584,6 +586,52 @@ static uint32_t kgd_v9_4_3_ptl_ctrl(struct 
>>> amdgpu_device *adev,
>>>               ptl_state, fmt1, fmt2);
>>>   }
>>>   +static int kgd_gfx_v9_4_3_hqd_sdma_get_counter(struct amdgpu_device 
>>> *adev,
>>> +                    void *mqd, uint64_t *val)
>>> +{
>>> +    struct v9_sdma_mqd *m = get_sdma_mqd(mqd);
>>> +    uint32_t sdma_rlc_reg_offset;
>>> +    uint32_t sdma_rlc_rb_cntl;
>>> +    uint32_t engine_id, queue_id;
>>> +    uint32_t engines = adev->sdma.num_instances;
>>> +    uint32_t sdma_rlcx_rb_base, sdma_rlcx_rb_base_hi;
>>> +    bool found = false;
>>> +
>>> +    if (!m)
>>> +        return -EINVAL;
>>> +
>>> +    for (engine_id = 0; engine_id < engines && !found; engine_id++) {
>>> +        for (queue_id = 0; queue_id < SDMA_QUEUES_NUM_PER_ENG; queue_id++) 
>>> {
>>> +            sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
>>> +                        engine_id, queue_id);
>> [HK]: sdma_rlc_reg_offset read could be move inside the if condition. Saves 
>> unncessary register read here.
> It won't work, sdma_rlc_reg_offset is needed in the next line for reading 
> sdm_rlcx_rb_base from register.
>>
>>> +            sdma_rlcx_rb_base = RREG32(sdma_rlc_reg_offset +
>>> +                        regSDMA_RLC0_RB_BASE);
>>> +            sdma_rlcx_rb_base_hi = RREG32(sdma_rlc_reg_offset +
>>> +                        regSDMA_RLC0_RB_BASE_HI);
>>> +
>>> +            if (m->sdmax_rlcx_rb_base == sdma_rlcx_rb_base &&
>>> +                m->sdmax_rlcx_rb_base_hi == sdma_rlcx_rb_base_hi) {
>>> +                found = true;
>>> +                break;
>>> +            }
>>> +        }
>>> +    }
>>> +
>> [HK]: Needs a if(!found) error handling.
> Because SDMA doesn't support oversubscription, there must be a HQD associated 
> with a MQD, found must be true here. I probably need to add the comment in 
> finding loop for that.
>>
>>
>>> +    sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
>>> +
>>> +    /* Read sdma activity counter from utilization register
>>> +     * if hw queue is enabled, otherwise read from MQD.
>>> +     */
>>> +    if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK)
>>> +        *val = (uint64_t)RREG32(sdma_rlc_reg_offset + 
>>> regSDMA_RLC0_UTILIZATION_HI) << 32 |
>>> +            RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_UTILIZATION_LO);
>>> +    else
>>> +        *val = (uint64_t)m->sdmax_rlcx_utilization_hi << 32 |
>>> +            m->sdmax_rlcx_utilization_lo;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>   const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>>>       .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
>>>       .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping,
>>> @@ -623,5 +671,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
>>>       .trigger_pc_sample_trap = kgd_v9_4_3_trigger_pc_sample_trap,
>>>       .override_core_cg = kgd_gfx_v9_4_3_override_core_cg,
>>>       .setup_stoch_sampling = kgd_v9_4_3_setup_stoch_sampling,
>>> -    .ptl_ctrl = kgd_v9_4_3_ptl_ctrl
>>> +    .ptl_ctrl = kgd_v9_4_3_ptl_ctrl,
>>> +    .hqd_sdma_get_counter = kgd_gfx_v9_4_3_hqd_sdma_get_counter
>>>   };
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index b934863312d0..a65161659f74 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -1067,8 +1067,15 @@ static int destroy_queue_nocpsch(struct 
>>> device_queue_manager *dqm,
>>>       /* Get the SDMA queue stats */
>>>       if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>>           (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>> -        retval = read_sdma_queue_counter((uint64_t __user 
>>> *)q->properties.read_ptr,
>>> -                            &sdma_val);
>>> +        if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2)))
>>> +            retval = read_sdma_queue_counter(
>>> +                    (uint64_t __user *)q->properties.read_ptr,
>>> +                    &sdma_val);
>>> +        else
>>> +            retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
>>> +                 dqm->dev->kfd2kgd->hqd_sdma_get_counter(
>>> +                    dqm->dev->adev, q->mqd, &sdma_val) :
>>> +                 0;
>> [HK]: What ahout 9.4.4 and 9.5? Do we we support those now? Otherwise, it 
>> will silently report 0 and not error.
> gc_9_4_3_kfd2kgd funcs are shared with gfx 9.4.4 and 9.5. The only thing I 
> think is return -ENOTSUP instead of 0, if no function is found here. what do 
> you think?
>>
>>>           if (retval)
>>>               dev_err(dev, "Failed to read SDMA queue counter for queue: 
>>> %d\n",
>>>                   q->properties.queue_id);
>>> @@ -2728,8 +2735,16 @@ static int destroy_queue_cpsch(struct 
>>> device_queue_manager *dqm,
>>>       /* Get the SDMA queue stats */
>>>       if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>>           (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
>>> -        retval = read_sdma_queue_counter((uint64_t __user 
>>> *)q->properties.read_ptr,
>>> -                            &sdma_val);
>>> +        if (KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2))
>>> +            retval = read_sdma_queue_counter(
>>> +                    (uint64_t __user *)q->properties.read_ptr,
>>> +                    &sdma_val);
>>> +        else
>>> +            retval = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
>>> +                 dqm->dev->kfd2kgd->hqd_sdma_get_counter(
>>> +                    dqm->dev->adev, q->mqd, &sdma_val) :
>>> +                 0;
>>> +
>>>           if (retval)
>>>               dev_err(dev, "Failed to read SDMA queue counter for queue: 
>>> %d\n",
>>>                   q->properties.queue_id);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index 0be2fd04e6d0..911f974e6bf5 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -95,6 +95,7 @@ struct kfd_sdma_activity_handler_workarea {
>>>     struct temp_sdma_queue_list {
>>>       uint64_t __user *rptr;
>>> +    void *mqd;
>>>       uint64_t sdma_val;
>>>       unsigned int queue_id;
>>>       struct list_head list;
>>> @@ -165,6 +166,7 @@ static void kfd_sdma_activity_worker(struct work_struct 
>>> *work)
>>>             INIT_LIST_HEAD(&sdma_q->list);
>>>           sdma_q->rptr = (uint64_t __user *)q->properties.read_ptr;
>>> +        sdma_q->mqd = q->mqd;
>>>           sdma_q->queue_id = q->properties.queue_id;
>>>           list_add_tail(&sdma_q->list, &sdma_q_list.list);
>>>       }
>>> @@ -193,7 +195,16 @@ static void kfd_sdma_activity_worker(struct 
>>> work_struct *work)
>>>         list_for_each_entry(sdma_q, &sdma_q_list.list, list) {
>>>           val = 0;
>>> -        ret = read_sdma_queue_counter(sdma_q->rptr, &val);
>>> +
>>> +        if ((KFD_GC_VERSION(dqm->dev) <= IP_VERSION(9, 4, 2)))
>>> +            ret = read_sdma_queue_counter(sdma_q->rptr, &val);
>>> +        else
>>> +            ret = dqm->dev->kfd2kgd->hqd_sdma_get_counter ?
>>> +                  dqm->dev->kfd2kgd->hqd_sdma_get_counter(
>>> +                    dqm->dev->adev,
>>> +                    sdma_q->mqd, &val) :
>>> +                  0;
>>> +
>>>           if (ret) {
>>>               pr_debug("Failed to read SDMA queue active counter for queue 
>>> id: %d",
>>>                    sdma_q->queue_id);
>>> diff --git a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h 
>>> b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
>>> index ead81aeffd67..8700f8190c7c 100644
>>> --- a/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
>>> +++ b/drivers/gpu/drm/amd/include/asic_reg/sdma/sdma_4_4_2_offset.h
>>> @@ -493,6 +493,10 @@
>>>   #define regSDMA_RLC0_MIDCMD_DATA10_BASE_IDX                               
>>>                               0
>>>   #define regSDMA_RLC0_MIDCMD_CNTL                                          
>>>                               0x017b
>>>   #define regSDMA_RLC0_MIDCMD_CNTL_BASE_IDX                                 
>>>                               0
>>> +#define regSDMA_RLC0_UTILIZATION_LO                                    
>>> 0x017c
>>> +#define regSDMA_RLC0_UTILIZATION_LO_BASE_IDX                               
>>>  0
>>> +#define regSDMA_RLC0_UTILIZATION_HI                                    
>>> 0x017d
>>> +#define regSDMA_RLC0_UTILIZATION_HI_BASE_IDX                               
>>>  0
>>>   #define regSDMA_RLC1_RB_CNTL                                              
>>>                               0x018
>> [HK]: Formatting error. You need to use space instead of tabs, I guess.
> Sure.
>>
>>
>> 8
>>>   #define regSDMA_RLC1_RB_CNTL_BASE_IDX                                     
>>>                               0
>>>   #define regSDMA_RLC1_RB_BASE                                              
>>>                               0x0189
>>> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h 
>>> b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>> index d34c869b182f..f3220794c108 100644
>>> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>>> @@ -361,6 +361,8 @@ struct kfd2kgd_calls {
>>>                    uint32_t *ptl_state,
>>>                    enum amdgpu_ptl_fmt *fmt1,
>>>                    enum amdgpu_ptl_fmt *fmt2);
>>> +    int (*hqd_sdma_get_counter)(struct amdgpu_device *adev,
>>> +                    void *mqd, uint64_t *val);
>> [HK]: #define SDMA_QUEUES_NUM_PER_ENG    8 <-- We already hard code this 
>> value in KFD. I think #define could be avoided if you pass in 
>> number_of_sdma_queues_per_engine as a parameter.
> OK.
> 
> Regards,
> Eric
>>
>>
>>>   };
>>>     #endif    /* KGD_KFD_INTERFACE_H_INCLUDED */
>>> diff --git a/drivers/gpu/drm/amd/include/v9_structs.h 
>>> b/drivers/gpu/drm/amd/include/v9_structs.h
>>> index a2f81b9c38af..e0d387f08576 100644
>>> --- a/drivers/gpu/drm/amd/include/v9_structs.h
>>> +++ b/drivers/gpu/drm/amd/include/v9_structs.h
>>> @@ -69,8 +69,8 @@ struct v9_sdma_mqd {
>>>       uint32_t sdmax_rlcx_midcmd_cntl;
>>>       uint32_t reserved_42;
>>>       uint32_t reserved_43;
>>> -    uint32_t reserved_44;
>>> -    uint32_t reserved_45;
>>> +    uint32_t sdmax_rlcx_utilization_lo;
>>> +    uint32_t sdmax_rlcx_utilization_hi;
>>>       uint32_t reserved_46;
>>>       uint32_t reserved_47;
>>>       uint32_t reserved_48;
> 

Reply via email to