RE: [PATCH] drm/amdgpu: Update boot time errors polling sequence

2024-01-29 Thread Min, Frank
[AMD Official Use Only - General]

This patch looks good to me.

Reviewed-By: Frank.Min 

-Original Message-
From: amd-gfx  On Behalf Of Hawking Zhang
Sent: Monday, January 29, 2024 10:35 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: Update boot time errors polling sequence

Update boot time errors polling sequence to align with the latest firmware 
change.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9e67355d4718..9b7a5c1c9af5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4122,6 +4122,18 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
u32 reg_data;
int retry_loop;

+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == 
AMDGPU_RAS_BOOT_STEADY_STATUS) {
+   *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
+   return 0;
+   }
+   msleep(1);
+   }
+
/* The pattern for smn addressing in other SOC could be different from
 * the one for aqua_vanjaram. We should revisit the code if the pattern
 * is changed. In such case, replace the aqua_vanjaram implementation 
@@ -4129,7 +4141,7 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
   aqua_vanjaram_encode_ext_smn_addressing(instance);

-   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   for (retry_loop = 0; retry_loop <
+AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
*boot_error = reg_data;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 0b6ffae1e8bb..d10e5bb0e52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -46,6 +46,11 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
13, 13)
 #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)  AMDGPU_GET_REG_FIELD(x, 
31, 31)

+#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
+#define AMDGPU_RAS_BOOT_STEADY_STATUS  0xBA
+#define AMDGPU_RAS_BOOT_STATUS_MASK    0xFF
+#define AMDGPU_RAS_BOOT_SUCEESS        0x8000
+
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
 /* position of instance value in sub_block_index of
  * ta_ras_trigger_error_input, the sub block uses lower 12 bits
--
2.17.1



[PATCH] drm/amdgpu: Update boot time errors polling sequence

2024-01-29 Thread Hawking Zhang
Update boot time errors polling sequence to align with
the latest firmware change.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9e67355d4718..9b7a5c1c9af5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4122,6 +4122,18 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
u32 reg_data;
int retry_loop;
 
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == 
AMDGPU_RAS_BOOT_STEADY_STATUS) {
+   *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
+   return 0;
+   }
+   msleep(1);
+   }
+
/* The pattern for smn addressing in other SOC could be different from
 * the one for aqua_vanjaram. We should revisit the code if the pattern
 * is changed. In such case, replace the aqua_vanjaram implementation
@@ -4129,7 +4141,7 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
   aqua_vanjaram_encode_ext_smn_addressing(instance);
 
-   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
*boot_error = reg_data;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 0b6ffae1e8bb..d10e5bb0e52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -46,6 +46,11 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
13, 13)
 #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)  AMDGPU_GET_REG_FIELD(x, 
31, 31)
 
+#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
+#define AMDGPU_RAS_BOOT_STEADY_STATUS  0xBA
+#define AMDGPU_RAS_BOOT_STATUS_MASK    0xFF
+#define AMDGPU_RAS_BOOT_SUCEESS        0x8000
+
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
 /* position of instance value in sub_block_index of
  * ta_ras_trigger_error_input, the sub block uses lower 12 bits
-- 
2.17.1