Just realized that the patch I pasted won't work. Outer loop exit needs to be 
like this. 
        (reg & MP1_C2PMSG_90__CONTENT_MASK) != 0 && extended_wait-- >= 0

Anyway, that patch is only there to communicate what I really meant in the 
earlier comment.

Thanks,
Lijo

-----Original Message-----
From: amd-gfx <[email protected]> On Behalf Of Lazar, Lijo
Sent: Wednesday, December 1, 2021 10:44 AM
To: Yu, Lang <[email protected]>; [email protected]
Cc: Deucher, Alexander <[email protected]>; Huang, Ray 
<[email protected]>; Koenig, Christian <[email protected]>
Subject: Re: [PATCH] drm/amdgpu: add SMU debug option support



On 11/30/2021 10:47 AM, Lang Yu wrote:
> To maintain system error state when SMU errors occurred, which will 
> aid in debugging SMU firmware issues, add SMU debug option support.
> 
> It can be enabled or disabled via amdgpu_smu_debug debugfs file. When 
> enabled, it makes SMU errors fatal.
> It is disabled by default.
> 
> == Command Guide ==
> 
> 1, enable SMU debug option
> 
>   # echo 1 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> 2, disable SMU debug option
> 
>   # echo 0 > /sys/kernel/debug/dri/0/amdgpu_smu_debug
> 
> v2:
>   - Resend command when timeout.(Lijo)
>   - Use debugfs file instead of module parameter.
> 
> Signed-off-by: Lang Yu <[email protected]>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 32 +++++++++++++++++
>   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c      | 39 +++++++++++++++++++--
>   2 files changed, 69 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 164d6a9e9fbb..f9412de86599 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -39,6 +39,8 @@
>   
>   #if defined(CONFIG_DEBUG_FS)
>   
> +extern int amdgpu_smu_debug;
> +
>   /**
>    * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
>    *
> @@ -1152,6 +1154,8 @@ static ssize_t amdgpu_debugfs_gfxoff_read(struct file 
> *f, char __user *buf,
>       return result;
>   }
>   
> +
> +
>   static const struct file_operations amdgpu_debugfs_regs2_fops = {
>       .owner = THIS_MODULE,
>       .unlocked_ioctl = amdgpu_debugfs_regs2_ioctl, @@ -1609,6 +1613,26 
> @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
>   DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
>                       amdgpu_debugfs_sclk_set, "%llu\n");
>   
> +static int amdgpu_debugfs_smu_debug_get(void *data, u64 *val) {
> +     *val = amdgpu_smu_debug;
> +     return 0;
> +}
> +
> +static int amdgpu_debugfs_smu_debug_set(void *data, u64 val) {
> +     if (val != 0 && val != 1)
> +             return -EINVAL;
> +
> +     amdgpu_smu_debug = val;
> +     return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(fops_smu_debug,
> +                      amdgpu_debugfs_smu_debug_get,
> +                      amdgpu_debugfs_smu_debug_set,
> +                      "%llu\n");
> +
>   int amdgpu_debugfs_init(struct amdgpu_device *adev)
>   {
>       struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
> @@ -1632,6 +1656,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
>               return PTR_ERR(ent);
>       }
>   
> +     ent = debugfs_create_file("amdgpu_smu_debug", 0600, root, adev,
> +                               &fops_smu_debug);
> +     if (IS_ERR(ent)) {
> +             DRM_ERROR("unable to create amdgpu_smu_debug debugsfs file\n");
> +             return PTR_ERR(ent);
> +     }
> +
> +
>       /* Register debugfs entries for amdgpu_ttm */
>       amdgpu_ttm_debugfs_init(adev);
>       amdgpu_debugfs_pm_init(adev);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index 048ca1673863..b3969d7933d3 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -55,6 +55,14 @@
>   
>   #undef __SMU_DUMMY_MAP
>   #define __SMU_DUMMY_MAP(type)       #type
> +
> +/*
> + * Used to enable SMU debug option. When enabled, it makes SMU errors fatal.
> + * This will aid in debugging SMU firmware issues.
> + * (0 = disabled (default), 1 = enabled)  */ int amdgpu_smu_debug;
> +
>   static const char * const __smu_message_names[] = {
>       SMU_MESSAGE_TYPES
>   };
> @@ -272,6 +280,11 @@ int smu_cmn_send_msg_without_waiting(struct smu_context 
> *smu,
>       __smu_cmn_send_msg(smu, msg_index, param);
>       res = 0;
>   Out:
> +     if (unlikely(amdgpu_smu_debug == 1) && res) {
> +             mutex_unlock(&smu->message_lock);
> +             BUG();
> +     }
> +
>       return res;
>   }
>   
> @@ -288,9 +301,17 @@ int smu_cmn_send_msg_without_waiting(struct smu_context 
> *smu,
>   int smu_cmn_wait_for_response(struct smu_context *smu)
>   {
>       u32 reg;
> +     int res;
>   
>       reg = __smu_cmn_poll_stat(smu);
> -     return __smu_cmn_reg2errno(smu, reg);
> +     res = __smu_cmn_reg2errno(smu, reg);
> +
> +     if (unlikely(amdgpu_smu_debug == 1) && res) {
> +             mutex_unlock(&smu->message_lock);
> +             BUG();
> +     }
> +
> +     return res;
>   }
>   
>   /**
> @@ -328,6 +349,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
> *smu,
>                                   uint32_t param,
>                                   uint32_t *read_arg)
>   {
> +     int retry_count = 0;
>       int res, index;
>       u32 reg;
>   
> @@ -349,15 +371,28 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
> *smu,
>               __smu_cmn_reg_print_error(smu, reg, index, param, msg);
>               goto Out;
>       }
> +retry:
>       __smu_cmn_send_msg(smu, (uint16_t) index, param);
>       reg = __smu_cmn_poll_stat(smu);
>       res = __smu_cmn_reg2errno(smu, reg);
> -     if (res != 0)
> +     if (res != 0) {
>               __smu_cmn_reg_print_error(smu, reg, index, param, msg);
> +             if ((res == -ETIME) && (retry_count++ < 1)) {
> +                     usleep_range(500, 1000);
> +                     dev_err(smu->adev->dev,
> +                             "SMU: resend command: index:%d param:0x%08X 
> message:%s",
> +                             index, param, smu_get_message_name(smu, msg));
> +                     goto retry;
> +             }
> +             goto Out;
> +     }

Sorry, what I meant is to have an extended wait time in debug mode. 
Something like below, not a 'full retry' as in sending the message again.


+#define MAX_DBG_WAIT_CNT 3
+
  /**
   * __smu_cmn_poll_stat -- poll for a status from the SMU
   * smu: a pointer to SMU context
@@ -115,17 +117,24 @@ static void smu_cmn_read_arg(struct smu_context *smu,
  static u32 __smu_cmn_poll_stat(struct smu_context *smu)
  {
         struct amdgpu_device *adev = smu->adev;
-       int timeout = adev->usec_timeout * 20;
+       int timeout;
         u32 reg;
+       int extended_wait = smu_debug_mode ? MAX_DBG_WAIT_CNT : 0;

-       for ( ; timeout > 0; timeout--) {
-               reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
-               if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
-                       break;
+       do {
+               timeout = adev->usec_timeout * 20;
+               for (; timeout > 0; timeout--) {
+                       reg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+                       if ((reg & MP1_C2PMSG_90__CONTENT_MASK) != 0)
+                               break;

-               udelay(1);
-       }
+                       udelay(1);
+               }
+       } while (extended_wait-- >= 0);

+       if (extended_wait != MAX_DBG_WAIT_CNT && reg != SMU_RESP_NONE)
+               dev_err(adev->dev,
+                       "SMU: Unexpected extended wait for response");
         return reg;
  }

Thanks,
Lijo

>       if (read_arg)
>               smu_cmn_read_arg(smu, read_arg);
>   Out:
>       mutex_unlock(&smu->message_lock);
> +
> +     BUG_ON(unlikely(amdgpu_smu_debug == 1) && res);
> +
>       return res;
>   }
>   
> 

Reply via email to