On Thu, Mar 26, 2026 at 12:37 AM Lazar, Lijo <[email protected]> wrote: > > > > On 26-Mar-26 7:34 AM, Yang Wang wrote: > > PMFW may return invalid values due to internal calculation errors. > > so, the kmd driver must validate and sanitize the returned values to > > prevent issues caused by firmware calculation errors. > > > > For example, values 0xfffe (-2) and 0xffff (-1) are treated > > as invalid and clamped to 0. > > > > The problem with clamping is that the issue takes a different direction > after that. > > Presently, the issue is reported as garbage values reported in activity. > With clamping, the issue could get reported as 100% activity with light > load or 0% activity with a heavy load. That will take the debug in a > different direction. > > Instead, isn't it better to keep this as some errata and let user apps > filter out garbage values? The previous or next sample could reflect the > activity correctly. Later fix can be added to newer firmware, if that is > an option.
What about returning -EBUSY or -EAGAIN if the value is negative? Alex > > Thanks, > Lijo > > > > this applies to devices with CAB (Cache As Buffer) functionality. > > > > Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4905 > > > > Signed-off-by: Yang Wang <[email protected]> > > --- > > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 17 +++++++++++++++++ > > .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 10 +++++----- > > .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 10 +++++----- > > .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 10 +++++----- > > 4 files changed, 32 insertions(+), 15 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > index 609f5ab07d8a..365946c43e11 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > > @@ -2164,4 +2164,21 @@ static inline void smu_feature_init(struct > > smu_context *smu, int feature_num) > > smu_feature_list_clear_all(smu, SMU_FEATURE_LIST_ALLOWED); > > } > > > > +/* > > + * smu_safe_u16_nn - Make u16 safe by filtering negative overflow errors > > + * @val: Input u16 value, may contain invalid negative overflows > > + * > > + * Convert u16 to non-negative value. Cast to s16 to detect negative values > > + * caused by calculation errors. Return 0 for negative errors, return > > + * original value if valid. > > + * > > + * Return: Valid u16 value or 0 > > + */ > > +static inline u16 smu_safe_u16_nn(u16 val) > > +{ > > + s16 tmp = (s16)val; > > + > > + return tmp < 0 ? 0 : val; > > +} > > + > > #endif > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c > > index 9be7a2af560d..16f69b548ca4 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c > > @@ -774,13 +774,13 @@ static int smu_v13_0_0_get_smu_metrics_data(struct > > smu_context *smu, > > *value = metrics->AverageGfxclkFrequencyPreDs; > > break; > > case METRICS_AVERAGE_FCLK: > > - if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_13_0_0_BUSY_THRESHOLD) > > *value = metrics->AverageFclkFrequencyPostDs; > > else > > *value = metrics->AverageFclkFrequencyPreDs; > > break; > > case METRICS_AVERAGE_UCLK: > > - if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_13_0_0_BUSY_THRESHOLD) > > *value = metrics->AverageMemclkFrequencyPostDs; > > else > > *value = metrics->AverageMemclkFrequencyPreDs; > > @@ -801,7 +801,7 @@ static int smu_v13_0_0_get_smu_metrics_data(struct > > smu_context *smu, > > *value = metrics->AverageGfxActivity; > > break; > > case METRICS_AVERAGE_MEMACTIVITY: > > - *value = metrics->AverageUclkActivity; > > + *value = smu_safe_u16_nn(metrics->AverageUclkActivity); > > break; > > case METRICS_AVERAGE_VCNACTIVITY: > > *value = max(metrics->Vcn0ActivityPercentage, > > @@ -2086,7 +2086,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct > > smu_context *smu, > > > > metrics->AvgTemperature[TEMP_VR_MEM1]); > > > > gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity; > > - gpu_metrics->average_umc_activity = metrics->AverageUclkActivity; > > + gpu_metrics->average_umc_activity = > > smu_safe_u16_nn(metrics->AverageUclkActivity); > > gpu_metrics->average_mm_activity = > > max(metrics->Vcn0ActivityPercentage, > > > > metrics->Vcn1ActivityPercentage); > > > > @@ -2103,7 +2103,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct > > smu_context *smu, > > else > > gpu_metrics->average_gfxclk_frequency = > > metrics->AverageGfxclkFrequencyPreDs; > > > > - if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_13_0_0_BUSY_THRESHOLD) > > gpu_metrics->average_uclk_frequency = > > metrics->AverageMemclkFrequencyPostDs; > > else > > gpu_metrics->average_uclk_frequency = > > metrics->AverageMemclkFrequencyPreDs; > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c > > index 5cc15545da6e..34a5973b9a06 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c > > @@ -784,13 +784,13 @@ static int smu_v13_0_7_get_smu_metrics_data(struct > > smu_context *smu, > > *value = metrics->AverageGfxclkFrequencyPreDs; > > break; > > case METRICS_AVERAGE_FCLK: > > - if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_13_0_7_BUSY_THRESHOLD) > > *value = metrics->AverageFclkFrequencyPostDs; > > else > > *value = metrics->AverageFclkFrequencyPreDs; > > break; > > case METRICS_AVERAGE_UCLK: > > - if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_13_0_7_BUSY_THRESHOLD) > > *value = metrics->AverageMemclkFrequencyPostDs; > > else > > *value = metrics->AverageMemclkFrequencyPreDs; > > @@ -815,7 +815,7 @@ static int smu_v13_0_7_get_smu_metrics_data(struct > > smu_context *smu, > > *value = metrics->AverageGfxActivity; > > break; > > case METRICS_AVERAGE_MEMACTIVITY: > > - *value = metrics->AverageUclkActivity; > > + *value = smu_safe_u16_nn(metrics->AverageUclkActivity); > > break; > > case METRICS_AVERAGE_SOCKETPOWER: > > *value = metrics->AverageSocketPower << 8; > > @@ -2092,7 +2092,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct > > smu_context *smu, > > > > metrics->AvgTemperature[TEMP_VR_MEM1]); > > > > gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity; > > - gpu_metrics->average_umc_activity = metrics->AverageUclkActivity; > > + gpu_metrics->average_umc_activity = > > smu_safe_u16_nn(metrics->AverageUclkActivity); > > gpu_metrics->average_mm_activity = > > max(metrics->Vcn0ActivityPercentage, > > > > metrics->Vcn1ActivityPercentage); > > > > @@ -2105,7 +2105,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct > > smu_context *smu, > > else > > gpu_metrics->average_gfxclk_frequency = > > metrics->AverageGfxclkFrequencyPreDs; > > > > - if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_13_0_7_BUSY_THRESHOLD) > > gpu_metrics->average_uclk_frequency = > > metrics->AverageMemclkFrequencyPostDs; > > else > > gpu_metrics->average_uclk_frequency = > > metrics->AverageMemclkFrequencyPreDs; > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > > b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > > index 28c1b084fe62..aaec3a251e0f 100644 > > --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > > @@ -660,13 +660,13 @@ static int smu_v14_0_2_get_smu_metrics_data(struct > > smu_context *smu, > > *value = metrics->AverageGfxclkFrequencyPreDs; > > break; > > case METRICS_AVERAGE_FCLK: > > - if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_14_0_2_BUSY_THRESHOLD) > > *value = metrics->AverageFclkFrequencyPostDs; > > else > > *value = metrics->AverageFclkFrequencyPreDs; > > break; > > case METRICS_AVERAGE_UCLK: > > - if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_14_0_2_BUSY_THRESHOLD) > > *value = metrics->AverageMemclkFrequencyPostDs; > > else > > *value = metrics->AverageMemclkFrequencyPreDs; > > @@ -687,7 +687,7 @@ static int smu_v14_0_2_get_smu_metrics_data(struct > > smu_context *smu, > > *value = metrics->AverageGfxActivity; > > break; > > case METRICS_AVERAGE_MEMACTIVITY: > > - *value = metrics->AverageUclkActivity; > > + *value = smu_safe_u16_nn(metrics->AverageUclkActivity); > > break; > > case METRICS_AVERAGE_VCNACTIVITY: > > *value = max(metrics->AverageVcn0ActivityPercentage, > > @@ -2146,7 +2146,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct > > smu_context *smu, > > > > metrics->AvgTemperature[TEMP_VR_MEM1]); > > > > gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity; > > - gpu_metrics->average_umc_activity = metrics->AverageUclkActivity; > > + gpu_metrics->average_umc_activity = > > smu_safe_u16_nn(metrics->AverageUclkActivity); > > gpu_metrics->average_mm_activity = > > max(metrics->AverageVcn0ActivityPercentage, > > > > metrics->Vcn1ActivityPercentage); > > > > @@ -2158,7 +2158,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct > > smu_context *smu, > > else > > gpu_metrics->average_gfxclk_frequency = > > metrics->AverageGfxclkFrequencyPreDs; > > > > - if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD) > > + if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= > > SMU_14_0_2_BUSY_THRESHOLD) > > gpu_metrics->average_uclk_frequency = > > metrics->AverageMemclkFrequencyPostDs; > > else > > gpu_metrics->average_uclk_frequency = > > metrics->AverageMemclkFrequencyPreDs; >
