On Thu, Mar 26, 2026 at 12:37 AM Lazar, Lijo <[email protected]> wrote:
>
>
>
> On 26-Mar-26 7:34 AM, Yang Wang wrote:
> > PMFW may return invalid values due to internal calculation errors.
> > so, the kmd driver must validate and sanitize the returned values to
> > prevent issues caused by firmware calculation errors.
> >
> > For example, values 0xfffe (-2) and 0xffff (-1) are treated
> > as invalid and clamped to 0.
> >
>
> The problem with clamping is that the issue takes a different direction
> after that.
>
> Presently, the issue is reported as garbage values reported in activity.
> With clamping, the issue could get reported as 100% activity with light
> load or 0% activity with a heavy load. That will take the debug in a
> different direction.
>
> Instead, isn't it better to keep this as some errata and let user apps
> filter out garbage values? The previous or next sample could reflect the
> activity correctly. Later fix can be added to newer firmware, if that is
> an option.

What about returning -EBUSY or -EAGAIN if the value is negative?

Alex

>
> Thanks,
> Lijo
>
>
> > this applies to devices with CAB (Cache As Buffer) functionality.
> >
> > Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4905
> >
> > Signed-off-by: Yang Wang <[email protected]>
> > ---
> >   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h   | 17 +++++++++++++++++
> >   .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c    | 10 +++++-----
> >   .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c    | 10 +++++-----
> >   .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c    | 10 +++++-----
> >   4 files changed, 32 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > index 609f5ab07d8a..365946c43e11 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > @@ -2164,4 +2164,21 @@ static inline void smu_feature_init(struct 
> > smu_context *smu, int feature_num)
> >       smu_feature_list_clear_all(smu, SMU_FEATURE_LIST_ALLOWED);
> >   }
> >
> > +/*
> > + * smu_safe_u16_nn - Make u16 safe by filtering negative overflow errors
> > + * @val: Input u16 value, may contain invalid negative overflows
> > + *
> > + * Convert u16 to non-negative value. Cast to s16 to detect negative values
> > + * caused by calculation errors. Return 0 for negative errors, return
> > + * original value if valid.
> > + *
> > + * Return: Valid u16 value or 0
> > + */
> > +static inline u16 smu_safe_u16_nn(u16 val)
> > +{
> > +    s16 tmp = (s16)val;
> > +
> > +    return tmp < 0 ? 0 : val;
> > +}
> > +
> >   #endif
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
> > index 9be7a2af560d..16f69b548ca4 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
> > @@ -774,13 +774,13 @@ static int smu_v13_0_0_get_smu_metrics_data(struct 
> > smu_context *smu,
> >                       *value = metrics->AverageGfxclkFrequencyPreDs;
> >               break;
> >       case METRICS_AVERAGE_FCLK:
> > -             if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
> > +             if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_13_0_0_BUSY_THRESHOLD)
> >                       *value = metrics->AverageFclkFrequencyPostDs;
> >               else
> >                       *value = metrics->AverageFclkFrequencyPreDs;
> >               break;
> >       case METRICS_AVERAGE_UCLK:
> > -             if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
> > +             if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_13_0_0_BUSY_THRESHOLD)
> >                       *value = metrics->AverageMemclkFrequencyPostDs;
> >               else
> >                       *value = metrics->AverageMemclkFrequencyPreDs;
> > @@ -801,7 +801,7 @@ static int smu_v13_0_0_get_smu_metrics_data(struct 
> > smu_context *smu,
> >               *value = metrics->AverageGfxActivity;
> >               break;
> >       case METRICS_AVERAGE_MEMACTIVITY:
> > -             *value = metrics->AverageUclkActivity;
> > +             *value = smu_safe_u16_nn(metrics->AverageUclkActivity);
> >               break;
> >       case METRICS_AVERAGE_VCNACTIVITY:
> >               *value = max(metrics->Vcn0ActivityPercentage,
> > @@ -2086,7 +2086,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct 
> > smu_context *smu,
> >                                            
> > metrics->AvgTemperature[TEMP_VR_MEM1]);
> >
> >       gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
> > -     gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
> > +     gpu_metrics->average_umc_activity = 
> > smu_safe_u16_nn(metrics->AverageUclkActivity);
> >       gpu_metrics->average_mm_activity = 
> > max(metrics->Vcn0ActivityPercentage,
> >                                              
> > metrics->Vcn1ActivityPercentage);
> >
> > @@ -2103,7 +2103,7 @@ static ssize_t smu_v13_0_0_get_gpu_metrics(struct 
> > smu_context *smu,
> >       else
> >               gpu_metrics->average_gfxclk_frequency = 
> > metrics->AverageGfxclkFrequencyPreDs;
> >
> > -     if (metrics->AverageUclkActivity <= SMU_13_0_0_BUSY_THRESHOLD)
> > +     if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_13_0_0_BUSY_THRESHOLD)
> >               gpu_metrics->average_uclk_frequency = 
> > metrics->AverageMemclkFrequencyPostDs;
> >       else
> >               gpu_metrics->average_uclk_frequency = 
> > metrics->AverageMemclkFrequencyPreDs;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c 
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
> > index 5cc15545da6e..34a5973b9a06 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
> > @@ -784,13 +784,13 @@ static int smu_v13_0_7_get_smu_metrics_data(struct 
> > smu_context *smu,
> >               *value = metrics->AverageGfxclkFrequencyPreDs;
> >               break;
> >       case METRICS_AVERAGE_FCLK:
> > -             if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
> > +             if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_13_0_7_BUSY_THRESHOLD)
> >                       *value = metrics->AverageFclkFrequencyPostDs;
> >               else
> >                       *value = metrics->AverageFclkFrequencyPreDs;
> >               break;
> >       case METRICS_AVERAGE_UCLK:
> > -             if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
> > +             if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_13_0_7_BUSY_THRESHOLD)
> >                       *value = metrics->AverageMemclkFrequencyPostDs;
> >               else
> >                       *value = metrics->AverageMemclkFrequencyPreDs;
> > @@ -815,7 +815,7 @@ static int smu_v13_0_7_get_smu_metrics_data(struct 
> > smu_context *smu,
> >               *value = metrics->AverageGfxActivity;
> >               break;
> >       case METRICS_AVERAGE_MEMACTIVITY:
> > -             *value = metrics->AverageUclkActivity;
> > +             *value = smu_safe_u16_nn(metrics->AverageUclkActivity);
> >               break;
> >       case METRICS_AVERAGE_SOCKETPOWER:
> >               *value = metrics->AverageSocketPower << 8;
> > @@ -2092,7 +2092,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct 
> > smu_context *smu,
> >                                            
> > metrics->AvgTemperature[TEMP_VR_MEM1]);
> >
> >       gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
> > -     gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
> > +     gpu_metrics->average_umc_activity = 
> > smu_safe_u16_nn(metrics->AverageUclkActivity);
> >       gpu_metrics->average_mm_activity = 
> > max(metrics->Vcn0ActivityPercentage,
> >                                              
> > metrics->Vcn1ActivityPercentage);
> >
> > @@ -2105,7 +2105,7 @@ static ssize_t smu_v13_0_7_get_gpu_metrics(struct 
> > smu_context *smu,
> >       else
> >               gpu_metrics->average_gfxclk_frequency = 
> > metrics->AverageGfxclkFrequencyPreDs;
> >
> > -     if (metrics->AverageUclkActivity <= SMU_13_0_7_BUSY_THRESHOLD)
> > +     if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_13_0_7_BUSY_THRESHOLD)
> >               gpu_metrics->average_uclk_frequency = 
> > metrics->AverageMemclkFrequencyPostDs;
> >       else
> >               gpu_metrics->average_uclk_frequency = 
> > metrics->AverageMemclkFrequencyPreDs;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c 
> > b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
> > index 28c1b084fe62..aaec3a251e0f 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
> > @@ -660,13 +660,13 @@ static int smu_v14_0_2_get_smu_metrics_data(struct 
> > smu_context *smu,
> >                       *value = metrics->AverageGfxclkFrequencyPreDs;
> >               break;
> >       case METRICS_AVERAGE_FCLK:
> > -             if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
> > +             if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_14_0_2_BUSY_THRESHOLD)
> >                       *value = metrics->AverageFclkFrequencyPostDs;
> >               else
> >                       *value = metrics->AverageFclkFrequencyPreDs;
> >               break;
> >       case METRICS_AVERAGE_UCLK:
> > -             if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
> > +             if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_14_0_2_BUSY_THRESHOLD)
> >                       *value = metrics->AverageMemclkFrequencyPostDs;
> >               else
> >                       *value = metrics->AverageMemclkFrequencyPreDs;
> > @@ -687,7 +687,7 @@ static int smu_v14_0_2_get_smu_metrics_data(struct 
> > smu_context *smu,
> >               *value = metrics->AverageGfxActivity;
> >               break;
> >       case METRICS_AVERAGE_MEMACTIVITY:
> > -             *value = metrics->AverageUclkActivity;
> > +             *value = smu_safe_u16_nn(metrics->AverageUclkActivity);
> >               break;
> >       case METRICS_AVERAGE_VCNACTIVITY:
> >               *value = max(metrics->AverageVcn0ActivityPercentage,
> > @@ -2146,7 +2146,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct 
> > smu_context *smu,
> >                                            
> > metrics->AvgTemperature[TEMP_VR_MEM1]);
> >
> >       gpu_metrics->average_gfx_activity = metrics->AverageGfxActivity;
> > -     gpu_metrics->average_umc_activity = metrics->AverageUclkActivity;
> > +     gpu_metrics->average_umc_activity = 
> > smu_safe_u16_nn(metrics->AverageUclkActivity);
> >       gpu_metrics->average_mm_activity = 
> > max(metrics->AverageVcn0ActivityPercentage,
> >                                              
> > metrics->Vcn1ActivityPercentage);
> >
> > @@ -2158,7 +2158,7 @@ static ssize_t smu_v14_0_2_get_gpu_metrics(struct 
> > smu_context *smu,
> >       else
> >               gpu_metrics->average_gfxclk_frequency = 
> > metrics->AverageGfxclkFrequencyPreDs;
> >
> > -     if (metrics->AverageUclkActivity <= SMU_14_0_2_BUSY_THRESHOLD)
> > +     if (smu_safe_u16_nn(metrics->AverageUclkActivity) <= 
> > SMU_14_0_2_BUSY_THRESHOLD)
> >               gpu_metrics->average_uclk_frequency = 
> > metrics->AverageMemclkFrequencyPostDs;
> >       else
> >               gpu_metrics->average_uclk_frequency = 
> > metrics->AverageMemclkFrequencyPreDs;
>

Reply via email to