On 04-Mar-26 1:49 PM, Asad Kamal wrote:
Add aid, xcd & hbm temperatures to gpu metrics for smu_v13_0_12

v2: Use correct umc control per stack (Lijo)

Signed-off-by: Asad Kamal <[email protected]>

Reviewed-by: Lijo Lazar <[email protected]>

Thanks,
Lijo

---
  .../gpu/drm/amd/include/kgd_pp_interface.h    |  3 ++
  .../drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 37 ++++++++++++++++++-
  .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h  | 13 ++++++-
  3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index bdf8e6ff556c..a9b73f4fd466 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -584,6 +584,9 @@ enum amdgpu_metrics_attr_id {
        AMDGPU_METRICS_ATTR_ID_GFX_BELOW_HOST_LIMIT_THM_ACC,
        AMDGPU_METRICS_ATTR_ID_GFX_LOW_UTILIZATION_ACC,
        AMDGPU_METRICS_ATTR_ID_GFX_BELOW_HOST_LIMIT_TOTAL_ACC,
+       AMDGPU_METRICS_ATTR_ID_TEMPERATURE_HBM,
+       AMDGPU_METRICS_ATTR_ID_TEMPERATURE_AID,
+       AMDGPU_METRICS_ATTR_ID_TEMPERATURE_XCD,
        AMDGPU_METRICS_ATTR_ID_MAX,
  };
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
index f2a6ecb64c03..96a58d43db53 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c
@@ -49,6 +49,13 @@
  #undef pr_info
  #undef pr_debug
+#define hbm_stack_mask_valid(umc_mask) \
+       (((umc_mask) & 0x3) == 0x3)
+
+#define for_each_hbm_stack(stack_idx, umc_mask) \
+       for ((stack_idx) = 0; (umc_mask); \
+            (umc_mask) >>= 2, (stack_idx)++) \
+
  #define SMU_13_0_12_FEA_MAP(smu_feature, smu_13_0_12_feature)                    \
        [smu_feature] = { 1, (smu_13_0_12_feature) }
@@ -834,7 +841,7 @@ void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table,
                                  struct smu_v13_0_6_gpu_metrics *gpu_metrics)
  {
        struct amdgpu_device *adev = smu->adev;
-       int ret = 0, xcc_id, inst, i, j;
+       int ret = 0, xcc_id, inst, i, j, idx;
        u8 num_jpeg_rings_gpu_metrics;
        MetricsTable_t *metrics;
@@ -849,6 +856,31 @@ void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table,
        gpu_metrics->temperature_vrsoc =
                SMUQ10_ROUND(metrics->MaxVrTemperature);
+       if (smu_v13_0_6_cap_supported(smu,
+                                     SMU_CAP(TEMP_AID_XCD_HBM))) {
+               if (adev->umc.active_mask) {
+                       u64 mask = adev->umc.active_mask;
+                       int out_idx = 0;
+                       int stack_idx;
+
+                       if (unlikely(hweight64(mask) / 2 > SMU_13_0_6_MAX_HBM_STACKS)) {
+                               dev_warn(adev->dev, "Invalid umc mask %lld\n", mask);
+                       } else  {
+                               for_each_hbm_stack(stack_idx, mask) {
+                                       if (!hbm_stack_mask_valid(mask))
+                                               continue;
+                                       gpu_metrics->temperature_hbm[out_idx++] =
+                                               metrics->HbmTemperature[stack_idx];
+                               }
+                       }
+               }
+               idx = 0;
+               for_each_inst(i, adev->aid_mask) {
+                       gpu_metrics->temperature_aid[idx] = metrics->AidTemperature[i];
+                       idx++;
+               }
+       }
+
        gpu_metrics->average_gfx_activity =
                SMUQ10_ROUND(metrics->SocketGfxBusy);
        gpu_metrics->average_umc_activity =
@@ -964,6 +996,9 @@ void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table,
                                [i] = SMUQ10_ROUND(
                                metrics->GfxclkBelowHostLimitTotalAcc[inst]);
                }
+               if (smu_v13_0_6_cap_supported(smu,
+                                             SMU_CAP(TEMP_AID_XCD_HBM)))
+                       gpu_metrics->temperature_xcd[i] = metrics->XcdTemperature[inst];
        }
gpu_metrics->xgmi_link_width = metrics->XgmiWidth;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h
index ffb06564f830..a150fc88902c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h
@@ -78,6 +78,7 @@ enum smu_v13_0_6_caps {
        SMU_CAP(RAS_EEPROM),
        SMU_CAP(FAST_PPT),
        SMU_CAP(SYSTEM_POWER_METRICS),
+       SMU_CAP(TEMP_AID_XCD_HBM),
        SMU_CAP(ALL),
  };
@@ -87,6 +88,8 @@ enum smu_v13_0_6_caps {
  #define SMU_13_0_6_MAX_XCC 8
  #define SMU_13_0_6_MAX_VCN 4
  #define SMU_13_0_6_MAX_JPEG 40
+#define SMU_13_0_6_MAX_AID 4
+#define SMU_13_0_6_MAX_HBM_STACKS 8
extern void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu);
  bool smu_v13_0_6_cap_supported(struct smu_context *smu, enum smu_v13_0_6_caps cap);
@@ -222,7 +225,15 @@ extern const struct ras_smu_drv smu_v13_0_12_ras_smu_drv;
                  SMU_13_0_6_MAX_XCC);                                         \
        SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_TOTAL_ACC), SMU_MUNIT(NONE),  \
                  SMU_MTYPE(U64), gfx_below_host_limit_total_acc,              \
-                 SMU_13_0_6_MAX_XCC);
+                 SMU_13_0_6_MAX_XCC);                                         \
+       SMU_ARRAY(SMU_MATTR(TEMPERATURE_HBM), SMU_MUNIT(TEMP_1),               \
+                 SMU_MTYPE(U16), temperature_hbm,                             \
+                 SMU_13_0_6_MAX_HBM_STACKS);                                  \
+       SMU_ARRAY(SMU_MATTR(TEMPERATURE_AID), SMU_MUNIT(TEMP_1),               \
+                 SMU_MTYPE(U16), temperature_aid, SMU_13_0_6_MAX_AID);        \
+       SMU_ARRAY(SMU_MATTR(TEMPERATURE_XCD), SMU_MUNIT(TEMP_1),               \
+                 SMU_MTYPE(U16), temperature_xcd, SMU_13_0_6_MAX_XCC);        \
+
DECLARE_SMU_METRICS_CLASS(smu_v13_0_6_gpu_metrics, SMU_13_0_6_METRICS_FIELDS);
  void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table,

Reply via email to