When a RAS fatal error is detected, PMFW will only process priority
messages. Other messages won't be taken up for processing and therefore
won't get any response in such a state.

Add logic to filter out non-priority messages when RAS error is
detected. Also, don't poll response response status register before
sending priority messages. Use firmware capability flag to determine
whether to filter priority messages.

Signed-off-by: Lijo Lazar <lijo.la...@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 65 +++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 3227e514e8ae..6d1c3af927ca 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -235,6 +235,50 @@ static void __smu_cmn_send_msg(struct smu_context *smu,
        WREG32(smu->msg_reg, msg);
 }
 
+static inline uint32_t __smu_cmn_get_msg_flags(struct smu_context *smu,
+                                              enum smu_message_type msg)
+{
+       return smu->message_map[msg].flags;
+}
+
+static int __smu_cmn_ras_filter_msg(struct smu_context *smu,
+                                   enum smu_message_type msg, bool *poll)
+{
+       struct amdgpu_device *adev = smu->adev;
+       uint32_t flags, resp;
+       bool fed_status;
+
+       flags = __smu_cmn_get_msg_flags(smu, msg);
+       *poll = true;
+
+       /* When there is RAS fatal error, FW won't process non-RAS priority
+        * messages. Don't allow any messages other than RAS priority messages.
+        */
+       fed_status = amdgpu_ras_get_fed_status(adev);
+       if (fed_status) {
+               if (!(flags & SMU_MSG_RAS_PRI)) {
+                       dev_dbg(adev->dev,
+                               "RAS error detected, skip sending %s",
+                               smu_get_message_name(smu, msg));
+                       return -EACCES;
+               }
+
+               /* FW will ignore non-priority messages when a RAS fatal error
+                * is detected. Hence it is possible that a previous message
+                * wouldn't have got response. Allow to continue without polling
+                * for response status for priority messages.
+                */
+               resp = RREG32(smu->resp_reg);
+               dev_dbg(adev->dev,
+                       "Sending RAS priority message %s response status: %x",
+                       smu_get_message_name(smu, msg), resp);
+               if (resp == 0)
+                       *poll = false;
+       }
+
+       return 0;
+}
+
 static int __smu_cmn_send_debug_msg(struct smu_context *smu,
                               u32 msg,
                               u32 param)
@@ -354,6 +398,7 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 {
        struct amdgpu_device *adev = smu->adev;
        int res, index;
+       bool poll = true;
        u32 reg;
 
        if (adev->no_hw_access)
@@ -366,12 +411,20 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context 
*smu,
                return index == -EACCES ? 0 : index;
 
        mutex_lock(&smu->message_lock);
-       reg = __smu_cmn_poll_stat(smu);
-       res = __smu_cmn_reg2errno(smu, reg);
-       if (reg == SMU_RESP_NONE ||
-           res == -EREMOTEIO) {
-               __smu_cmn_reg_print_error(smu, reg, index, param, msg);
-               goto Out;
+
+       if (smu->smc_fw_caps & SMU_FW_CAP_RAS_PRI) {
+               res = __smu_cmn_ras_filter_msg(smu, msg, &poll);
+               if (res)
+                       goto Out;
+       }
+
+       if (poll) {
+               reg = __smu_cmn_poll_stat(smu);
+               res = __smu_cmn_reg2errno(smu, reg);
+               if (reg == SMU_RESP_NONE || res == -EREMOTEIO) {
+                       __smu_cmn_reg_print_error(smu, reg, index, param, msg);
+                       goto Out;
+               }
        }
        __smu_cmn_send_msg(smu, (uint16_t) index, param);
        reg = __smu_cmn_poll_stat(smu);
-- 
2.25.1

Reply via email to