[AMD Official Use Only]


> -----Original Message-----
> From: Tuikov, Luben <luben.tui...@amd.com>
> Sent: Monday, July 12, 2021 11:31 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Tuikov, Luben <luben.tui...@amd.com>; Deucher, Alexander
> <alexander.deuc...@amd.com>; Quan, Evan <evan.q...@amd.com>
> Subject: [PATCH] drm/amd/pm: Fix a bug communicating with the SMU
> 
> This fixes a bug which if we probe a non-existing
> I2C device, and the SMU returns 0xFF, 
[Quan, Evan] Do we have to probe I2C device via issuing commands to SMU and 
check existence via SMU response?
Is there other more friendly way?
>from then on
> we can never communicate with the SMU, because the
> code before this patch reads and interprets 0xFF
> as a terminal error, and thus we never write 0
> into register 90 to clear the status (and
> subsequently send a new command to the SMU.)
> 
> It is not an error that the SMU returns status
> 0xFF. This means that the SMU executed the last
> command successfully (execution status), but the
> command result is an error of some sort (execution
> result), depending on what the command was.
> 
> When doing a status check of the SMU, before we
> send a new command, the only status which
> precludes us from sending a new command is 0--the
> SMU hasn't finished executing a previous command,
> and 0xFC--the SMU is busy.
> 
> This bug was seen as the following line in the
> kernel log,
> 
> amdgpu: Msg issuing pre-check failed(0xff) and SMU may be not in the right
> state!
[Quan, Evan] This was designed to prevent failure scenario from ruin.
Via this, we can prevent those SMU command/response related registers 
overwritten. 
Then PMFW team can known which command invoked the first error.

BR
Evan
> 
> when subsequent SMU commands, not necessarily
> related to I2C, were sent to the SMU.
> 
> This patch fixes this bug.
> 
> Cc: Alex Deucher <alexander.deuc...@amd.com>
> Cc: Evan Quan <evan.q...@amd.com>
> Fixes: fcb1fe9c9e0031 ("drm/amd/powerplay: pre-check the SMU state
> before issuing message")
> Signed-off-by: Luben Tuikov <luben.tui...@amd.com>
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 196
> +++++++++++++++++++------
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h |   3 +-
>  2 files changed, 152 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index c902fdf322c1be..775eb50a2e49a6 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -55,7 +55,7 @@
> 
>  #undef __SMU_DUMMY_MAP
>  #define __SMU_DUMMY_MAP(type)        #type
> -static const char* __smu_message_names[] = {
> +static const char * const __smu_message_names[] = {
>       SMU_MESSAGE_TYPES
>  };
> 
> @@ -76,46 +76,161 @@ static void smu_cmn_read_arg(struct smu_context
> *smu,
>       *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
>  }
> 
> -int smu_cmn_wait_for_response(struct smu_context *smu)
> +/**
> + * __smu_cmn_poll_stat -- poll for a status from the SMU
> + * smu: a pointer to SMU context
> + *
> + * Returns the status of the SMU, which could be,
> + * 0, the SMU is busy with your previous command;
> + * 1,    execution status: success, execution result: success;
> + * 0xFF, execution status: success, execution result: failure;
> + * 0xFE, unknown command;
> + * 0xFD, valid command, but bad (command) prerequisites;
> + * 0xFC, the command was rejected as the SMU is busy;
> + * 0xFB, "SMC_Result_DebugDataDumpEnd".
> + */
> +static u32 __smu_cmn_poll_stat(struct smu_context *smu)
>  {
>       struct amdgpu_device *adev = smu->adev;
> -     uint32_t cur_value, i, timeout = adev->usec_timeout * 20;
> +     int timeout = adev->usec_timeout * 20;
> +     u32 reg;
> 
> -     for (i = 0; i < timeout; i++) {
> -             cur_value = RREG32_SOC15(MP1, 0,
> mmMP1_SMN_C2PMSG_90);
> -             if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> -                     return cur_value;
> +     for ( ; timeout > 0; timeout--) {
> +             reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> +             if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
> +                     break;
> 
>               udelay(1);
>       }
> 
> -     /* timeout means wrong logic */
> -     if (i == timeout)
> -             return -ETIME;
> -
> -     return RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
> +     return reg;
>  }
> 
> -int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> -                                  uint16_t msg, uint32_t param)
> +static void __smu_cmn_reg_print_error(struct smu_context *smu,
> +                                   u32 reg_c2pmsg_90,
> +                                   int msg_index,
> +                                   u32 param,
> +                                   enum smu_message_type msg)
>  {
>       struct amdgpu_device *adev = smu->adev;
> -     int ret;
> +     const char *message = smu_get_message_name(smu, msg);
> 
> -     ret = smu_cmn_wait_for_response(smu);
> -     if (ret != 0x1) {
> -             dev_err(adev->dev, "Msg issuing pre-check failed(0x%x) and
> "
> -                    "SMU may be not in the right state!\n", ret);
> -             if (ret != -ETIME)
> -                     ret = -EIO;
> -             return ret;
> +     switch (reg_c2pmsg_90) {
> +     case 0:
> +             dev_err_ratelimited(adev->dev,
> +                                 "SMU: I'm not done with your previous
> command!");
> +             break;
> +     case 1:
> +             /* The SMU executed the command. It completed with a
> +              * successful result.
> +              */
> +             break;
> +     case 0xFF:
> +             /* The SMU executed the command. It completed with a
> +              * unsuccessful result.
> +              */
> +             break;
> +     case 0xFE:
> +             dev_err_ratelimited(adev->dev,
> +                                 "SMU: unknown command: index:%d
> param:0x%08X message:%s",
> +                                 msg_index, param, message);
> +             break;
> +     case 0xFD:
> +             dev_err_ratelimited(adev->dev,
> +                                 "SMU: valid command, bad prerequisites:
> index:%d param:0x%08X message:%s",
> +                                 msg_index, param, message);
> +             break;
> +     case 0xFC:
> +             dev_err_ratelimited(adev->dev,
> +                                 "SMU: I'm very busy for your command:
> index:%d param:0x%08X message:%s",
> +                                 msg_index, param, message);
> +             break;
> +     case 0xFB:
> +             dev_err_ratelimited(adev->dev,
> +                                 "SMU: I'm debugging!");
> +             break;
> +     default:
> +             dev_err_ratelimited(adev->dev,
> +                                 "SMU: response:0x%08X for index:%d
> param:0x%08X message:%s?",
> +                                 reg_c2pmsg_90, msg_index, param,
> message);
> +             break;
> +     }
> +}
> +
> +static int __smu_cmn_reg2errno(struct smu_context *smu, u32
> reg_c2pmsg_90)
> +{
> +     int res;
> +
> +     switch (reg_c2pmsg_90) {
> +     case 0:
> +             res = -ETIME;
> +             break;
> +     case 1:
> +             res = 0;
> +             break;
> +     case 0xFF:
> +             res = -EIO;
> +             break;
> +     case 0xFE:
> +             res = -EOPNOTSUPP;
> +             break;
> +     case 0xFD:
> +             res = -EIO;
> +             break;
> +     case 0xFC:
> +             res = -EBUSY;
> +             break;
> +     case 0xFB:
> +             res = -EIO;
> +             break;
> +     default:
> +             res = -EIO;
> +             break;
>       }
> 
> +     return res;
> +}
> +
> +static void __smu_cmn_send_msg(struct smu_context *smu,
> +                            u16 msg,
> +                            u32 param)
> +{
> +     struct amdgpu_device *adev = smu->adev;
> +
>       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90, 0);
>       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82, param);
>       WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
> +}
> 
> -     return 0;
> +int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> +                                  uint16_t msg_index,
> +                                  uint32_t param)
> +{
> +     u32 reg;
> +     int res;
> +
> +     if (smu->adev->in_pci_err_recovery)
> +             return 0;
> +
> +     mutex_lock(&smu->message_lock);
> +     reg = __smu_cmn_poll_stat(smu);
> +     if (reg == 0 || reg == 0xFC) {
> +             res = __smu_cmn_reg2errno(smu, reg);
> +             goto Out;
> +     }
> +     __smu_cmn_send_msg(smu, msg_index, param);
> +     res = 0;
> +Out:
> +     mutex_unlock(&smu->message_lock);
> +     return res;
> +}
> +
> +int smu_cmn_wait_for_response(struct smu_context *smu)
> +{
> +     u32 reg;
> +
> +     reg = __smu_cmn_poll_stat(smu);
> +     return __smu_cmn_reg2errno(smu, reg);
>  }
> 
>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
> @@ -123,8 +238,8 @@ int smu_cmn_send_smc_msg_with_param(struct
> smu_context *smu,
>                                   uint32_t param,
>                                   uint32_t *read_arg)
>  {
> -     struct amdgpu_device *adev = smu->adev;
> -     int ret = 0, index = 0;
> +     int res, index;
> +     u32 reg;
> 
>       if (smu->adev->in_pci_err_recovery)
>               return 0;
> @@ -136,31 +251,20 @@ int smu_cmn_send_smc_msg_with_param(struct
> smu_context *smu,
>               return index == -EACCES ? 0 : index;
> 
>       mutex_lock(&smu->message_lock);
> -     ret = smu_cmn_send_msg_without_waiting(smu, (uint16_t)index,
> param);
> -     if (ret)
> -             goto out;
> -
> -     ret = smu_cmn_wait_for_response(smu);
> -     if (ret != 0x1) {
> -             if (ret == -ETIME) {
> -                     dev_err(adev->dev, "message: %15s (%d) \tparam:
> 0x%08x is timeout (no response)\n",
> -                             smu_get_message_name(smu, msg), index,
> param);
> -             } else {
> -                     dev_err(adev->dev, "failed send message: %15s (%d)
> \tparam: 0x%08x response %#x\n",
> -                             smu_get_message_name(smu, msg), index,
> param,
> -                             ret);
> -                     ret = -EIO;
> -             }
> -             goto out;
> +     reg = __smu_cmn_poll_stat(smu);
> +     if (reg == 0 || reg == 0xFC) {
> +             res = __smu_cmn_reg2errno(smu, reg);
> +             __smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +             goto Out;
>       }
> -
> +     __smu_cmn_send_msg(smu, (uint16_t) index, param);
> +     reg = __smu_cmn_poll_stat(smu);
> +     res = __smu_cmn_reg2errno(smu, reg);
>       if (read_arg)
>               smu_cmn_read_arg(smu, read_arg);
> -
> -     ret = 0; /* 0 as driver return value */
> -out:
> +Out:
>       mutex_unlock(&smu->message_lock);
> -     return ret;
> +     return res;
>  }
> 
>  int smu_cmn_send_smc_msg(struct smu_context *smu,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> index 9add5f16ff562a..16993daa2ae042 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
> @@ -27,7 +27,8 @@
> 
>  #if defined(SWSMU_CODE_LAYER_L2) || defined(SWSMU_CODE_LAYER_L3)
> || defined(SWSMU_CODE_LAYER_L4)
>  int smu_cmn_send_msg_without_waiting(struct smu_context *smu,
> -                                  uint16_t msg, uint32_t param);
> +                                  uint16_t msg_index,
> +                                  uint32_t param);
>  int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>                                   enum smu_message_type msg,
>                                   uint32_t param,
> --
> 2.32.0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to