from:"Hawking Zhang"

[PATCH] drm/amdgpu: correct hbm field in boot status

2024-05-21 Thread Hawking Zhang

The hbm field takes bit 13 and bit 14 in boot status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c8980d5f6540..7021c4a66fb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -46,7 +46,7 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)AMDGPU_GET_REG_FIELD(x, 
7, 7)
 #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)
AMDGPU_GET_REG_FIELD(x, 10, 8)
 #define AMDGPU_RAS_GPU_ERR_AID_ID(x)   AMDGPU_GET_REG_FIELD(x, 
12, 11)
-#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
13, 13)
+#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
14, 13)
 #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)  AMDGPU_GET_REG_FIELD(x, 
31, 31)
 
 #define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
-- 
2.17.1

[PATCH] drm/amdgpu: Use driver mode reset for data poison

2024-04-16 Thread Hawking Zhang

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 27 ++-
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..c3beb872adf8d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
uint16_t pasid, uint16_t client_id)
 {
enum amdgpu_ras_block block = 0;
-   int old_poison, ret = -EINVAL;
+   int old_poison;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -184,22 +180,15 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
-   break;
+   dev_warn(dev->adev->dev,
+"client %d does not support poison consumption\n", 
client_id);
+   return;
}
 
kfd_signal_poison_consumed_event(dev, pasid);
 
-   /* resetting queue passes, do page retirement without gpu reset
-* resetting queue fails, fallback to gpu reset solution
-*/
-   if (!ret)
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, unmap queue flow succeeded: 
client id %d\n",
-   client_id);
-   else
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, fall back to gpu reset flow: 
client id %d\n",
-   client_id);
+   dev_warn(dev->adev->dev,
+"poison is consumed by client %d, kick off gpu reset flow\n", 
client_id);
 
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
-- 
2.17.1

[PATCH] drm/amdgpu: Use driver mode reset for data poison handling

2024-04-15 Thread Hawking Zhang

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 22 +++
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..94eb2493103ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
uint16_t pasid, uint16_t client_id)
 {
enum amdgpu_ras_block block = 0;
-   int old_poison, ret = -EINVAL;
+   int old_poison;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -189,18 +185,6 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
 
kfd_signal_poison_consumed_event(dev, pasid);
 
-   /* resetting queue passes, do page retirement without gpu reset
-* resetting queue fails, fallback to gpu reset solution
-*/
-   if (!ret)
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, unmap queue flow succeeded: 
client id %d\n",
-   client_id);
-   else
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, fall back to gpu reset flow: 
client id %d\n",
-   client_id);
-
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }
 
-- 
2.17.1

[PATCH] drm/amdgpu: Use driver mode reset for data poison handling

2024-04-15 Thread Hawking Zhang

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..b6caf6eda8a0c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct 
kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
-- 
2.17.1

[PATCH] drm/amdgpu: Process bif doorbell event

2024-04-01 Thread Hawking Zhang

When BACO exit is triggered by a doorbell transaction,
firmware will configure bif to issue an msi interrupt to
indicate the doorbell transaction. If the bif ring is not
enabled in such a case, the driver needs to ack the interrupt
by clearing the interrupt status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 +
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c   | 56 
 drivers/gpu/drm/amd/amdgpu/soc21.c   | 14 +-
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index 7b8c03be1d9e..db341921cfc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -102,6 +102,7 @@ struct amdgpu_nbio_funcs {
u32 (*get_memory_partition_mode)(struct amdgpu_device *adev,
 u32 *supp_modes);
u64 (*get_pcie_replay_count)(struct amdgpu_device *adev);
+   u32 (*init_bif_doorbell_event)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_nbio {
@@ -111,6 +112,7 @@ struct amdgpu_nbio {
struct ras_common_if *ras_if;
const struct amdgpu_nbio_funcs *funcs;
struct amdgpu_nbio_ras  *ras;
+   struct amdgpu_irq_src bif_doorbell_irq;
 };
 
 int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
index 7f88a298ac5f..e5a331b6eee9 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
@@ -477,6 +477,61 @@ static void nbio_v4_3_program_aspm(struct amdgpu_device 
*adev)
 #endif
 }
 
+static int nbio_v4_3_set_bif_doorbell_irq_state(struct amdgpu_device *adev,
+   struct amdgpu_irq_src *src,
+   unsigned type,
+   enum amdgpu_interrupt_state 
state)
+{
+/*let firmware to config bif doorbell irq state */
+return 0;
+}
+
+static int nbio_v4_3_process_bif_doorbell_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+   /* pmfw will config bif doorbell irq to host if baco exit
+* is triggered by doorbell transaction. In such case, driver
+* needs to clear the interrupt status */
+
+   uint32_t reg_data;
+
+   reg_data = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_RB_CNTL);
+
+   /* if bif ring is enabled, do nothing */
+   if (REG_GET_FIELD(reg_data, BIF_BX0_BIF_RB_CNTL, RB_ENABLE))
+   return 0;
+
+   /* write 1 to clear doorbell interrupt */
+   reg_data = RREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   if (REG_GET_FIELD(reg_data, BIF_BX0_BIF_DOORBELL_INT_CNTL, 
DOORBELL_INTERRUPT_STATUS)) {
+   reg_data = REG_SET_FIELD(reg_data,
+BIF_BX0_BIF_DOORBELL_INT_CNTL,
+DOORBELL_INTERRUPT_CLEAR, 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
reg_data);
+   }
+
+   return 0;
+}
+
+static const struct amdgpu_irq_src_funcs nbio_v4_3_bif_doorbell_irq_funcs = {
+   .set = nbio_v4_3_set_bif_doorbell_irq_state,
+   .process = nbio_v4_3_process_bif_doorbell_irq,
+};
+
+static u32 nbio_v4_3_init_bif_doorbell_event(struct amdgpu_device *adev)
+{
+   u32 r;
+
+   adev->nbio.bif_doorbell_irq.funcs = &nbio_v4_3_bif_doorbell_irq_funcs;
+   adev->nbio.bif_doorbell_irq.num_types = 1;
+
+   r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_BIF,
+ NBIF_7_4__SRCID__DOORBELL_INTERRUPT,
+ &adev->nbio.bif_doorbell_irq);
+   return r;
+}
+
 const struct amdgpu_nbio_funcs nbio_v4_3_funcs = {
.get_hdp_flush_req_offset = nbio_v4_3_get_hdp_flush_req_offset,
.get_hdp_flush_done_offset = nbio_v4_3_get_hdp_flush_done_offset,
@@ -499,6 +554,7 @@ const struct amdgpu_nbio_funcs nbio_v4_3_funcs = {
.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
.get_rom_offset = nbio_v4_3_get_rom_offset,
.program_aspm = nbio_v4_3_program_aspm,
+   .init_bif_doorbell_event = nbio_v4_3_init_bif_doorbell_event,
 };
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index abe319b0f063..ee6d810589c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -792,6 +792,9 @@ static int soc21_common_late_init(void *handle)
 * nbio v4_3 only support fatal error hanlding
 * just enable the interrupt directly */
amdgpu_irq_get(adev, 
&adev->nbio.ras_err_event_athub_irq, 0);
+   if (adev->nbio.bif_doorbell_irq.funcs &&
+   adev->nbio.

[PATCH] drm/amdgpu: Bypass asd if display hw is not available

2024-03-30 Thread Hawking Zhang

ASD is not needed by headless GPU.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b310fdb719d..83bf86352267d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;
 
+   /* bypass asd if display hardware is not available */
+   if (!amdgpu_device_has_display_hardware(psp->adev) &&
+   amdgpu_ip_version(psp->adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10))
+   return 0;
+
psp->asd_context.mem_context.shared_mc_addr  = 0;
psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
psp->asd_context.ta_load_type= GFX_CMD_ID_LOAD_ASD;
-- 
2.17.1

[PATCH] drm/amdgpu: Bypass asd if display hw is not available

2024-03-29 Thread Hawking Zhang

ASD is not needed by headless GPU.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b310fdb719d..063203865bbe2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;
 
+   /* bypass asd if display hardware is not available */
+   if (!amdgpu_device_has_display_hardware(psp->adev) &&
+   amdgpu_ip_version(adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10))
+   return 0;
+
psp->asd_context.mem_context.shared_mc_addr  = 0;
psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
psp->asd_context.ta_load_type= GFX_CMD_ID_LOAD_ASD;
-- 
2.17.1

[PATCH] drm/amdgpu: Bypass display ta if display hw is not available

2024-03-14 Thread Hawking Zhang

Do not load/invoke display TA if display hardware is not
available

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 867397fe2e9d..e7d7fd2cc31d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1830,6 +1830,10 @@ static int psp_hdcp_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass hdcp initialization if dmu is harvested */
+   if (!amdgpu_device_has_display_hardware(psp->adev))
+   return 0;
+
if (!psp->hdcp_context.context.bin_desc.size_bytes ||
!psp->hdcp_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "HDCP: optional hdcp ta ucode is not 
available\n");
@@ -1862,6 +1866,9 @@ int psp_hdcp_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->hdcp_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->hdcp_context.context);
 }
 
@@ -1897,6 +1904,10 @@ static int psp_dtm_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass dtm initialization if dmu is harvested */
+   if (!amdgpu_device_has_display_hardware(psp->adev))
+   return 0;
+
if (!psp->dtm_context.context.bin_desc.size_bytes ||
!psp->dtm_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "DTM: optional dtm ta ucode is not 
available\n");
@@ -1929,6 +1940,9 @@ int psp_dtm_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->dtm_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->dtm_context.context);
 }
 
@@ -2063,6 +2077,10 @@ static int psp_securedisplay_initialize(struct 
psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass securedisplay initialization if dmu is harvested */
+   if (!amdgpu_device_has_display_hardware(psp->adev))
+return 0;
+
if (!psp->securedisplay_context.context.bin_desc.size_bytes ||
!psp->securedisplay_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "SECUREDISPLAY: securedisplay ta ucode 
is not available\n");
-- 
2.17.1

[PATCH] drm/amdgpu: Bypass display ta if it is harvested

2024-03-14 Thread Hawking Zhang

Display TA doesn't need to be loaded/invoked if it
is harvested.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 867397fe2e9d..bb4988c45ca9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1830,6 +1830,10 @@ static int psp_hdcp_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass hdcp initialization if dmu is harvested */
+   if (psp->adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
+   return 0;
+
if (!psp->hdcp_context.context.bin_desc.size_bytes ||
!psp->hdcp_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "HDCP: optional hdcp ta ucode is not 
available\n");
@@ -1862,6 +1866,9 @@ int psp_hdcp_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->hdcp_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->hdcp_context.context);
 }
 
@@ -1897,6 +1904,10 @@ static int psp_dtm_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass dtm initialization if dmu is harvested */
+   if (psp->adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
+   return 0;
+
if (!psp->dtm_context.context.bin_desc.size_bytes ||
!psp->dtm_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "DTM: optional dtm ta ucode is not 
available\n");
@@ -1929,6 +1940,9 @@ int psp_dtm_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   if (!psp->dtm_context.context.initialized)
+   return 0;
+
return psp_ta_invoke(psp, ta_cmd_id, &psp->dtm_context.context);
 }
 
@@ -2063,6 +2077,10 @@ static int psp_securedisplay_initialize(struct 
psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
+   /* bypass securedisplay initialization if dmu is harvested */
+   if (psp->adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
+return 0;
+
if (!psp->securedisplay_context.context.bin_desc.size_bytes ||
!psp->securedisplay_context.context.bin_desc.start_addr) {
dev_info(psp->adev->dev, "SECUREDISPLAY: securedisplay ta ucode 
is not available\n");
-- 
2.17.1

[PATCH] drm/amdgpu: Do not enable/disable bif ras irq from guest

2024-02-17 Thread Hawking Zhang

Only do this from host side.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 15033efec2ba..2c8702560090 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1278,7 +1278,8 @@ static int soc15_common_hw_fini(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_put_irq(adev);
 
-   if (adev->nbio.ras_if &&
+   if ((!amdgpu_sriov_vf(adev)) &&
+   adev->nbio.ras_if &&
amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt)
-- 
2.17.1

[PATCH] drm/amdgpu: Update boot time errors polling sequence

2024-01-29 Thread Hawking Zhang

Update boot time errors polling sequence to align with
the latest firmware change.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 +
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9e67355d4718..9b7a5c1c9af5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4122,6 +4122,18 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
u32 reg_data;
int retry_loop;
 
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == 
AMDGPU_RAS_BOOT_STEADY_STATUS) {
+   *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
+   return 0;
+   }
+   msleep(1);
+   }
+
/* The pattern for smn addressing in other SOC could be different from
 * the one for aqua_vanjaram. We should revisit the code if the pattern
 * is changed. In such case, replace the aqua_vanjaram implementation
@@ -4129,7 +4141,7 @@ static int amdgpu_ras_wait_for_boot_complete(struct 
amdgpu_device *adev,
reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
   aqua_vanjaram_encode_ext_smn_addressing(instance);
 
-   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; 
retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
*boot_error = reg_data;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 0b6ffae1e8bb..d10e5bb0e52f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -46,6 +46,11 @@ struct amdgpu_iv_entry;
 #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)   AMDGPU_GET_REG_FIELD(x, 
13, 13)
 #define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)  AMDGPU_GET_REG_FIELD(x, 
31, 31)
 
+#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT   1000
+#define AMDGPU_RAS_BOOT_STEADY_STATUS  0xBA
+#define AMDGPU_RAS_BOOT_STATUS_MASK0xFF
+#define AMDGPU_RAS_BOOT_SUCEESS0x8000
+
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
 /* position of instance value in sub_block_index of
  * ta_ras_trigger_error_input, the sub block uses lower 12 bits
-- 
2.17.1

[PATCH] drm/amdgpu: Fix null pointer dereference

2024-01-22 Thread Hawking Zhang

amdgpu_reg_state_sysfs_fini could be invoked before
asic_funcs is even initialized, e.g., when
amdgpu_discovery_init fails for some reason.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/include/amdgpu_reg_state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/include/amdgpu_reg_state.h 
b/drivers/gpu/drm/amd/include/amdgpu_reg_state.h
index be519c8edf49..335980e2afbf 100644
--- a/drivers/gpu/drm/amd/include/amdgpu_reg_state.h
+++ b/drivers/gpu/drm/amd/include/amdgpu_reg_state.h
@@ -138,7 +138,7 @@ static inline size_t amdgpu_reginst_size(uint16_t num_inst, 
size_t inst_size,
 }
 
 #define amdgpu_asic_get_reg_state_supported(adev) \
-   ((adev)->asic_funcs->get_reg_state ? 1 : 0)
+   (((adev)->asic_funcs && (adev)->asic_funcs->get_reg_state) ? 1 : 0)
 
 #define amdgpu_asic_get_reg_state(adev, state, buf, size)  \
((adev)->asic_funcs->get_reg_state ?   \
-- 
2.17.1

[PATCH v2 v2 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-07 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_330x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH v2 v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

2024-01-07 Thread Hawking Zhang

Add ras helper function to query boot time gpu
errors.
v2: use aqua_vanjaram smn addressing pattern

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9da14436a373..df3aa69be425 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1330,6 +1330,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fc42fb6ee191..a901b00d4949 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_920x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+

[PATCH v2 v2 2/5] drm/amdgpu: Init pcie_index/data address as fallback (v2)

2024-01-07 Thread Hawking Zhang

Allow this helper to be used for indirect access when
nbio funcs are not available, for instance, in the ip
discovery phase.

v2: define macro for pcie_index/data/index_hi fallback.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 +-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index abad5773714c..05d7cdcf28b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -96,6 +96,9 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 #define AMDGPU_RESUME_MS   2000
 #define AMDGPU_MAX_RETRY_LIMIT 2
 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) 
== -EINVAL)
+#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
+#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
+#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
 
 static const struct drm_driver amdgpu_kms_driver;
 
@@ -781,12 +784,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
+   pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1

[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-07 Thread Hawking Zhang

Check and report firmware boot status if it doesn't
reach steady status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1

[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-07 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_33    0x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-07 Thread Hawking Zhang

It will be replaced with a new implementation that also
covers boot failures in the IP discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a39c9fea55c4..abad5773714c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 2addbdf88394..90451cabb919 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,21 +2125,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = &adev->psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_

[PATCH v2 2/3] drm/amdgpu: Query ras capability from psp v2

2024-01-02 Thread Hawking Zhang

Instead of traditional atomfirmware interfaces for RAS
capability, host driver can query ras capability from
psp starting from psp v13_0_6.

v2: drop redundant local variable from get_ras_capability.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 0dc8686e54f7..af3bc36aef18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2128,6 +2128,16 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp)
+{
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   return psp->funcs->get_ras_capability(psp);
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"
 
 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+static bool psp_v13_0_get_ras_capability(struct psp_context *psp)
+{
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras_capability,
 };
 
 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-02 Thread Hawking Zhang

Move the ras capability check to amdgpu_ras_check_supported.
The driver will query ras capability through the psp interface,
the vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a901b00d4949..2ee82baaf7d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include <asm/mce.h>
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2696,4

[PATCH 1/3] drm/amdgpu: Align ras block enum with firmware

2024-01-02 Thread Hawking Zhang

Driver and firmware share the same ras block enum.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5785b705c692..8b053602c5ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -70,6 +70,8 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__VCN,
AMDGPU_RAS_BLOCK__JPEG,
+   AMDGPU_RAS_BLOCK__IH,
+   AMDGPU_RAS_BLOCK__MPIO,
 
AMDGPU_RAS_BLOCK__LAST
 };
-- 
2.17.1

[PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

2024-01-02 Thread Hawking Zhang

Add ras helper function to query boot time gpu
errors.
v2: use aqua_vanjaram smn addressing pattern

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..cd91533d641c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fc42fb6ee191..a901b00d4949 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_92    0x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+

[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-02 Thread Hawking Zhang

Check and report the firmware boot status if the bootloader
doesn't reach steady state.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1

[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-02 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_33    0x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-02 Thread Hawking Zhang

Allow this helper to be used for indirect register access
when nbio funcs are not available, for instance in the
IP discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 001a35fa0f19..873419a5b9aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = (0x38 >> 2);
+   pcie_data = (0x3C >> 2);
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = (0x44 >> 2);
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1

[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-02 Thread Hawking Zhang

It will be replaced with a new implementation that also
covers boot failures in the IP discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4b694696930e..001a35fa0f19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 51bfe3757c89..0dc8686e54f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2128,21 +2128,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = &adev->psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_

[PATCH 3/3] drm/amdgpu: Replace DRM_* with dev_* in amdgpu_psp.c

2024-01-01 Thread Hawking Zhang

Use dev_* so that kernel messages carry the device's PCIe BDF
information, which helps with debugging, especially on
multi-GPU systems.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 144 
 1 file changed, 75 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 8a3847d3041f..0d871479ff34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -291,21 +291,22 @@ static int psp_memory_training_init(struct psp_context 
*psp)
struct psp_memory_training_context *ctx = &psp->mem_train_ctx;
 
if (ctx->init != PSP_MEM_TRAIN_RESERVE_SUCCESS) {
-   DRM_DEBUG("memory training is not supported!\n");
+   dev_dbg(psp->adev->dev, "memory training is not supported!\n");
return 0;
}
 
ctx->sys_cache = kzalloc(ctx->train_data_size, GFP_KERNEL);
if (ctx->sys_cache == NULL) {
-   DRM_ERROR("alloc mem_train_ctx.sys_cache failed!\n");
+   dev_err(psp->adev->dev, "alloc mem_train_ctx.sys_cache 
failed!\n");
ret = -ENOMEM;
goto Err_out;
}
 
-   
DRM_DEBUG("train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n",
- ctx->train_data_size,
- ctx->p2c_train_data_offset,
- ctx->c2p_train_data_offset);
+   dev_dbg(psp->adev->dev,
+   
"train_data_size:%llx,p2c_train_data_offset:%llx,c2p_train_data_offset:%llx.\n",
+   ctx->train_data_size,
+   ctx->p2c_train_data_offset,
+   ctx->c2p_train_data_offset);
ctx->init = PSP_MEM_TRAIN_INIT_SUCCESS;
return 0;
 
@@ -407,7 +408,7 @@ static int psp_sw_init(void *handle)
 
psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
if (!psp->cmd) {
-   DRM_ERROR("Failed to allocate memory to command buffer!\n");
+   dev_err(adev->dev, "Failed to allocate memory to command 
buffer!\n");
ret = -ENOMEM;
}
 
@@ -454,13 +455,13 @@ static int psp_sw_init(void *handle)
if (mem_training_ctx->enable_mem_training) {
ret = psp_memory_training_init(psp);
if (ret) {
-   DRM_ERROR("Failed to initialize memory training!\n");
+   dev_err(adev->dev, "Failed to initialize memory 
training!\n");
return ret;
}
 
ret = psp_mem_training(psp, PSP_MEM_TRAIN_COLD_BOOT);
if (ret) {
-   DRM_ERROR("Failed to process memory training!\n");
+   dev_err(adev->dev, "Failed to process memory 
training!\n");
return ret;
}
}
@@ -675,9 +676,11 @@ psp_cmd_submit_buf(struct psp_context *psp,
 */
if (!skip_unsupport && (psp->cmd_buf_mem->resp.status || !timeout) && 
!ras_intr) {
if (ucode)
-   DRM_WARN("failed to load ucode %s(0x%X) ",
- amdgpu_ucode_name(ucode->ucode_id), 
ucode->ucode_id);
-   DRM_WARN("psp gfx command %s(0x%X) failed and response status 
is (0x%X)\n",
+   dev_warn(psp->adev->dev,
+"failed to load ucode %s(0x%X) ",
+amdgpu_ucode_name(ucode->ucode_id), 
ucode->ucode_id);
+   dev_warn(psp->adev->dev,
+"psp gfx command %s(0x%X) failed and response status 
is (0x%X)\n",
 psp_gfx_cmd_name(psp->cmd_buf_mem->cmd_id), 
psp->cmd_buf_mem->cmd_id,
 psp->cmd_buf_mem->resp.status);
/* If any firmware (including CAP) load fails under SRIOV, it 
should
@@ -807,7 +810,7 @@ static int psp_tmr_init(struct psp_context *psp)
psp->fw_pri_buf) {
ret = psp_load_toc(psp, &tmr_size);
if (ret) {
-   DRM_ERROR("Failed to load toc\n");
+   dev_err(psp->adev->dev, "Failed to load toc\n");
return ret;
}
}
@@ -855,7 +858,7 @@ static int psp_tmr_load(struct psp_context *psp)
 
psp_prep_tmr_cmd_buf(psp, cmd, psp->tmr_mc_addr, psp->tmr_bo);
if (psp->tmr_bo)
-   DRM_INFO("reserve 0x%lx from 0x%llx for PSP TMR\n",
+   dev_info(psp->adev->dev, "reserve 0x%lx from 0x%llx for PSP 
TMR\n",
 amdgpu

[PATCH 2/3] Revert "drm/amdgpu: enable mca debug mode on APU by default"

2024-01-01 Thread Hawking Zhang

This is no longer needed now that the firmware has been fixed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 842405bb8995..d6e74b4dc6d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3159,8 +3159,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
 
-   /* enable MCA debug on APU device */
-   amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
+   amdgpu_ras_set_mca_debug_mode(adev, false);
 
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Pack socket_id into ras feature mask

2024-01-01 Thread Hawking Zhang

Initialize RAS feature mask bits[31:29] with socket_id.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 72b6e41329b0..842405bb8995 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2936,6 +2936,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
+   /* Packed socket_id to ras feature mask bits[31:29] */
+   if (adev->smuio.funcs &&
+   adev->smuio.funcs->get_socket_id)
+   con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << 
29);
+
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
 
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

2024-01-01 Thread Hawking Zhang

Move the ras capability check to amdgpu_ras_check_supported.
The driver will query ras capability through the psp interface,
the vbios interface, or specific ip callbacks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 1 file changed, 93 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5f302b7693b3..72b6e41329b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -39,6 +39,7 @@
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "amdgpu_psp.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
+/* Query ras capablity via atomfirmware interface */
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device 
*adev)
+{
+   /* mem_ecc cap */
+   if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
+   dev_info(adev->dev, "MEM ECC is active.\n");
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
+1 << AMDGPU_RAS_BLOCK__DF);
+   } else {
+   dev_info(adev->dev, "MEM ECC is not presented.\n");
+   }
+
+   /* sram_ecc cap */
+   if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
+   dev_info(adev->dev, "SRAM ECC is active.\n");
+   if (!amdgpu_sriov_vf(adev))
+   adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
+ 1 << AMDGPU_RAS_BLOCK__DF);
+   else
+   adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__PCIE_BIF |
+1 << AMDGPU_RAS_BLOCK__SDMA |
+1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /*
+* VCN/JPEG RAS can be supported on both bare metal and
+* SRIOV environment
+*/
+   if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) 
||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
+1 << AMDGPU_RAS_BLOCK__JPEG);
+   else
+   adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
+ 1 << AMDGPU_RAS_BLOCK__JPEG);
+
+   /*
+* XGMI RAS is not supported if xgmi num physical nodes
+* is zero
+*/
+   if (!adev->gmc.xgmi.num_physical_nodes)
+   adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__XGMI_WAFL);
+   } else {
+   dev_info(adev->dev, "SRAM ECC is not presented.\n");
+   }
+}
+
+/* Query poison mode from umc/df IP callbacks */
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   bool df_poison, umc_poison;
+
+   /* poison setting is useless on SRIOV guest */
+   if (amdgpu_sriov_vf(adev) || !con)
+   return;
+
+   /* Init poison supported flag, the default value is false */
+   if (adev->gmc.xgmi.connected_to_cpu ||
+   adev->gmc.is_app_apu) {
+   /* enabled by default when GPU is connected to CPU */
+   con->poison_supported = true;
+   } else if (adev->df.funcs &&
+   adev->df.funcs->query_ras_poison_mode &&
+   adev->umc.ras &&
+   adev->umc.ras->query_ras_poison_mode) {
+   df_poison =
+   adev->df.funcs->query_ras_poison_mode(adev);
+   umc_poison =
+   adev->umc.ras->query_ras_poison_mode(adev);
+
+   /* Only poison is set in both DF and UMC, we can support it */
+   if (df_poison && umc_poison)
+   con->poison_supported = true;
+   else if (df_poison != umc_poison)
+   dev_warn(adev->dev,
+   "Poison setting is inconsistent in 
DF/UMC(%d:%d)!\n",
+   df_poison, umc_poison);
+   }
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -2696,4

[PATCH 2/3] drm/amdgpu: Query ras capablity from psp

2024-01-01 Thread Hawking Zhang

Instead of traditional atomfirmware interfaces for RAS
capability, host driver can query ras capability from
psp starting from psp v13_0_6.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 26 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 94b536e3cada..8a3847d3041f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,6 +2125,19 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp)
+{
+   bool ret;
+
+   if (psp->funcs &&
+   psp->funcs->get_ras_capability) {
+   ret = psp->funcs->get_ras_capability(psp);
+   return ret;
+   } else {
+   return false;
+   }
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 09d1f8f72a9c..652b0a01854a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   bool (*get_ras_capability)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,5 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
+bool amdgpu_psp_get_ras_capability(struct psp_context *psp);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 676bec2cc157..722b6066ce07 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -27,6 +27,7 @@
 #include "amdgpu_ucode.h"
 #include "soc15_common.h"
 #include "psp_v13_0.h"
+#include "amdgpu_ras.h"
 
 #include "mp/mp_13_0_2_offset.h"
 #include "mp/mp_13_0_2_sh_mask.h"
@@ -770,6 +771,30 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+static bool psp_v13_0_get_ras_capability(struct psp_context *psp)
+{
+   struct amdgpu_device *adev = psp->adev;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   u32 reg_data;
+
+   /* query ras cap should be done from host side */
+   if (amdgpu_sriov_vf(adev))
+   return false;
+
+   if (!con)
+   return false;
+
+   if ((amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) &&
+   (!(adev->flags & AMD_IS_APU))) {
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_127);
+   adev->ras_hw_enabled = (reg_data & GENMASK_ULL(23, 0));
+   con->poison_supported = ((reg_data & GENMASK_ULL(24, 24)) >> 
24) ? true : false;
+   return true;
+   } else {
+   return false;
+   }
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -792,6 +817,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.update_spirom = psp_v13_0_update_spirom,
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
+   .get_ras_capability = psp_v13_0_get_ras_capability,
 };
 
 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Align ras block enum with firmware

2024-01-01 Thread Hawking Zhang

Driver and firmware share the same ras block enum.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5785b705c692..8b053602c5ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -70,6 +70,8 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__VCN,
AMDGPU_RAS_BLOCK__JPEG,
+   AMDGPU_RAS_BLOCK__IH,
+   AMDGPU_RAS_BLOCK__MPIO,
 
AMDGPU_RAS_BLOCK__LAST
 };
-- 
2.17.1

[PATCH 0/3] Add ras cap query from psp

2024-01-01 Thread Hawking Zhang

Driver can query RAS capability through psp or bios.

Hawking Zhang (3):
  drm/amdgpu: Align ras block enum with firmware
  drm/amdgpu: Query ras capablity from psp
  drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported

 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c |  13 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |   2 +
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  |  26 
 5 files changed, 136 insertions(+), 77 deletions(-)

-- 
2.17.1

[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-01 Thread Hawking Zhang

Check and report firmware boot status if it doesn't
reach steady status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1

[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-01 Thread Hawking Zhang

Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_330x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1

[PATCH 3/5] drm/amdgpu: Add ras helper to query boot errors

2024-01-01 Thread Hawking Zhang

Add ras helper function to query boot time gpu
errors.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 616b6c911767..db44ec857a31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1328,6 +1328,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_SMN_TARGET_AID(x) ((u64)(x) << 32)
+#define AMDGPU_SMN_CROSS_AID (1ULL << 34)
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 39399d0f2ce5..5f302b7693b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3764,3 +3764,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_920x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   if (instance)
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  AMDGPU_SMN_TARGET_AID(instance) +
+  AMDGPU_SMN_CROSS_AID;
+   else
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+   *boot_error = reg_data;
+   return 0;
+   }
+   msleep(1);
+   }
+
+   *boot_error = reg_data;
+   return -ETIME;

[PATCH 2/5] drm/amdgpu: Init pcie_index/data address as fallback

2024-01-01 Thread Hawking Zhang

Allow this helper to be used for indirect access when
nbio funcs are not available, for instance, in the ip
discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 001a35fa0f19..873419a5b9aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -781,12 +781,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = (0x38 >> 2);
+   pcie_data = (0x3C >> 2);
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = (0x44 >> 2);
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(>pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1

[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-01 Thread Hawking Zhang

It will be replaced with a new implementation that also
covers boot failures in the ip discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4b694696930e..001a35fa0f19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 1bf975b8d083..94b536e3cada 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,21 +2125,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = >psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_

[PATCH 0/5] Add boot time error reporting

2024-01-01 Thread Hawking Zhang

For ASICs that support boot time error reporting, poll all
the boot time errors cached in registers and make them available
in the kernel log.

Hawking Zhang (5):
  drm/amdgpu: drop psp v13 query_boot_status implementation
  drm/amdgpu: Init pcie_index/data address as fallback
  drm/amdgpu: Add ras helper to query boot errors
  drm/amdgpu: Query boot status if discovery failed
  drm/amdgpu: Query boot status if boot failed

 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 22 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 15 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 95 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   | 15 ++-
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c| 89 ++---
 8 files changed, 141 insertions(+), 108 deletions(-)

-- 
2.17.1

[PATCH] drm/amdgpu: Switch to aca bank for xgmi pcs err cnt

2023-12-12 Thread Hawking Zhang

Instead of software managed counters.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h  | 2 ++
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 6 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index e51e8918e667..b399f1b62887 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -46,6 +46,8 @@
 #define MCA_REG__STATUS__ERRORCODEEXT(x)   MCA_REG_FIELD(x, 21, 16)
 #define MCA_REG__STATUS__ERRORCODE(x)  MCA_REG_FIELD(x, 15, 0)
 
+#define MCA_REG__MISC0__ERRCNT(x)  MCA_REG_FIELD(x, 43, 32)
+
 #define MCA_REG__SYND__ERRORINFORMATION(x) MCA_REG_FIELD(x, 17, 0)
 
 enum amdgpu_mca_ip {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index ddd782fbee7a..3998c9b31d07 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2537,13 +2537,15 @@ static int mca_pcs_xgmi_mca_get_err_count(const struct 
mca_ras_info *mca_ras, st
  uint32_t *count)
 {
u32 ext_error_code;
+   u32 err_cnt;
 
ext_error_code = 
MCA_REG__STATUS__ERRORCODEEXT(entry->regs[MCA_REG_IDX_STATUS]);
+   err_cnt = MCA_REG__MISC0__ERRCNT(entry->regs[MCA_REG_IDX_MISC0]);
 
if (type == AMDGPU_MCA_ERROR_TYPE_UE && ext_error_code == 0)
-   *count = 1;
+   *count = err_cnt;
else if (type == AMDGPU_MCA_ERROR_TYPE_CE && ext_error_code == 6)
-   *count = 1;
+   *count = err_cnt;
 
return 0;
 }
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Update fw version for boot time error query

2023-11-19 Thread Hawking Zhang

Boot time error query is not available until fw version a10109.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 3cf4684d0d3f..5f46877f78cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -821,7 +821,7 @@ static int psp_v13_0_query_boot_status(struct psp_context 
*psp)
if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 6))
return 0;
 
-   if (RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_59) < 0x00a10007)
+   if (RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_59) < 0x00a10109)
return 0;
 
for_each_inst(i, inst_mask) {
-- 
2.17.1

[PATCH 2/3] drm/amdgpu: Do not issue gpu reset from nbio v7_9 bif interrupt

2023-11-19 Thread Hawking Zhang

In nbio v7_9, the host driver should not issue a gpu reset.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index 23f26f8caad4..25a3da83e0fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
@@ -611,11 +611,6 @@ static void 
nbio_v7_9_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 
dev_info(adev->dev, "RAS controller interrupt triggered "
"by NBIF error\n");
-
-   /* ras_controller_int is dedicated for nbif ras error,
-* not the global interrupt for sync flood
-*/
-   amdgpu_ras_reset_gpu(adev);
}
 
amdgpu_ras_error_data_fini(_data);
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Retire query/reseet_ras_error_status from gfx_v9_4_3

2023-11-19 Thread Hawking Zhang

Not needed anymore.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 114 
 1 file changed, 114 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 40d06d32bb74..5df727be88c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3882,32 +3882,6 @@ static void gfx_v9_4_3_inst_reset_ras_err_count(struct 
amdgpu_device *adev,
mutex_unlock(>grbm_idx_mutex);
 }
 
-static void gfx_v9_4_3_inst_query_utc_err_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t data;
-
-   data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regUTCL2_MEM_ECC_STATUS);
-   if (data) {
-   dev_warn(adev->dev, "GFX UTCL2 Mem Ecc Status: 0x%x!\n", data);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regUTCL2_MEM_ECC_STATUS, 
0x3);
-   }
-
-   data = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_MEM_ECC_STATUS);
-   if (data) {
-   dev_warn(adev->dev, "GFX VML2 Mem Ecc Status: 0x%x!\n", data);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_MEM_ECC_STATUS, 
0x3);
-   }
-
-   data = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
-   regVML2_WALKER_MEM_ECC_STATUS);
-   if (data) {
-   dev_warn(adev->dev, "GFX VML2 Walker Mem Ecc Status: 0x%x!\n", 
data);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), 
regVML2_WALKER_MEM_ECC_STATUS,
-   0x3);
-   }
-}
-
 static void gfx_v9_4_3_log_cu_timeout_status(struct amdgpu_device *adev,
uint32_t status, int xcc_id)
 {
@@ -3950,82 +3924,6 @@ static void gfx_v9_4_3_log_cu_timeout_status(struct 
amdgpu_device *adev,
}
 }
 
-static void gfx_v9_4_3_inst_query_sq_timeout_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t se_idx, sh_idx, cu_idx;
-   uint32_t status;
-
-   mutex_lock(>grbm_idx_mutex);
-   for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; 
se_idx++) {
-   for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; 
sh_idx++) {
-   for (cu_idx = 0; cu_idx < 
adev->gfx.config.max_cu_per_sh; cu_idx++) {
-   gfx_v9_4_3_xcc_select_se_sh(adev, se_idx, 
sh_idx,
-   cu_idx, xcc_id);
-   status = RREG32_SOC15(GC, GET_INST(GC, xcc_id),
- regSQ_TIMEOUT_STATUS);
-   if (status != 0) {
-   dev_info(
-   adev->dev,
-   "GFX Watchdog Timeout: SE %d, 
SH %d, CU %d\n",
-   se_idx, sh_idx, cu_idx);
-   gfx_v9_4_3_log_cu_timeout_status(
-   adev, status, xcc_id);
-   }
-   /* clear old status */
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id),
-   regSQ_TIMEOUT_STATUS, 0);
-   }
-   }
-   }
-   gfx_v9_4_3_xcc_select_se_sh(adev, 0x, 0x, 0x,
-   xcc_id);
-   mutex_unlock(>grbm_idx_mutex);
-}
-
-static void gfx_v9_4_3_inst_query_ras_err_status(struct amdgpu_device *adev,
-   void *ras_error_status, int xcc_id)
-{
-   gfx_v9_4_3_inst_query_utc_err_status(adev, xcc_id);
-   gfx_v9_4_3_inst_query_sq_timeout_status(adev, xcc_id);
-}
-
-static void gfx_v9_4_3_inst_reset_utc_err_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regUTCL2_MEM_ECC_STATUS, 0x3);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_MEM_ECC_STATUS, 0x3);
-   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regVML2_WALKER_MEM_ECC_STATUS, 
0x3);
-}
-
-static void gfx_v9_4_3_inst_reset_sq_timeout_status(struct amdgpu_device *adev,
-   int xcc_id)
-{
-   uint32_t se_idx, sh_idx, cu_idx;
-
-   mutex_lock(>grbm_idx_mutex);
-   for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; 
se_idx++) {
-   for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; 
sh_idx++) {
-   for (cu_idx = 0; cu_idx < 
adev->gfx.config.max_cu_per_sh; cu_idx++) {
-   gfx_v9_4_3_xcc_select_se_sh(adev, se_idx, 
sh_idx,
-

[PATCH 3/3] drm/amdgpu: Query and report boot status

2023-11-01 Thread Hawking Zhang

Query boot status and report boot errors. A follow
up change is needed to stop GPU initialization if boot
fails.

v2: only invoke the call for dGPU (Le/Lijo)

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Reviewed-by: Yang Wang 
Reviewed-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c317a4869492..02d6246df938 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1070,6 +1070,8 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
+   /* TODO: check the return val and stop device initialization if 
boot fails */
+   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
-- 
2.17.1

[PATCH 2/3] drm/amdgpu: Add psp v13 function to query boot status

2023-11-01 Thread Hawking Zhang

Add psp v13 function to query boot status.

v2: limit the use case to dGPU only (Lijo)

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Reviewed-by: Yang Wang 
Reviewed-by: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 15 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  3 +
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 78 +
 3 files changed, 96 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index e6dc3cfbac0e..66d9c189af29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2120,6 +2120,21 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
+int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
+{
+   struct psp_context *psp = >psp;
+   int ret;
+
+   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
+   return 0;
+
+   if (psp->funcs &&
+   psp->funcs->query_boot_status)
+   ret = psp->funcs->query_boot_status(psp);
+
+   return ret;
+}
+
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 7111dd32e66f..5d36ad3f48c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,6 +134,7 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
+   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -537,4 +538,6 @@ int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
 
+int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 4142e2fcd866..3cf4684d0d3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -759,6 +759,83 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
+
+static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
+  uint32_t inst,
+  uint32_t boot_error)
+{
+   uint32_t socket_id;
+   uint32_t aid_id;
+   uint32_t hbm_id;
+   uint32_t reg_data;
+
+   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
+   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
+   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
+
+   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+socket_id, aid_id, reg_data);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_CP_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_DP_LINK_TRAINING))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_HBM_MEM_TEST))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_HBM_BIST_TEST))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+

[PATCH 1/3] drm/amdgpu: Add C2PMSG_109/126 reg field shift/masks

2023-11-01 Thread Hawking Zhang

Add MP0_C2PMSG_109/126 register field shift/masks
that are used to identify boot status by driver.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Reviewed-by: Yang Wang 
Reviewed-by: Le Ma 
---
 .../include/asic_reg/mp/mp_13_0_2_sh_mask.h   | 28 +++
 1 file changed, 28 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h
index 6e29a185de51..765d9ca2316f 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/mp/mp_13_0_2_sh_mask.h
@@ -242,6 +242,34 @@
 //MP0_SMN_C2PMSG_103
 #define MP0_SMN_C2PMSG_103__CONTENT__SHIFT 
   0x0
 #define MP0_SMN_C2PMSG_103__CONTENT_MASK   
   0xFFFFFFFFL
+//MP0_SMN_C2PMSG_109
+#define MP0_SMN_C2PMSG_109__CONTENT__SHIFT 
   0x0
+#define MP0_SMN_C2PMSG_109__CONTENT_MASK   
   0xFFFFFFFFL
+//MP0_SMN_C2PMSG_126
+#define MP0_SMN_C2PMSG_126__GPU_ERR_MEM_TRAINING__SHIFT
   0x0
+#define MP0_SMN_C2PMSG_126__GPU_ERR_FW_LOAD__SHIFT 
   0x1
+#define MP0_SMN_C2PMSG_126__GPU_ERR_WAFL_LINK_TRAINING__SHIFT  
   0x2
+#define MP0_SMN_C2PMSG_126__GPU_ERR_XGMI_LINK_TRAINING__SHIFT  
   0x3
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_CP_LINK_TRAINING__SHIFT
   0x4
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_DP_LINK_TRAINING__SHIFT
   0x5
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_MEM_TEST__SHIFT
   0x6
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_BIST_TEST__SHIFT   
   0x7
+#define MP0_SMN_C2PMSG_126__SOCKET_ID__SHIFT   
   0x8
+#define MP0_SMN_C2PMSG_126__AID_ID__SHIFT  
   0xb
+#define MP0_SMN_C2PMSG_126__HBM_ID__SHIFT  
   0xd
+#define MP0_SMN_C2PMSG_126__BOOT_STATUS__SHIFT 
   0x1f
+#define MP0_SMN_C2PMSG_126__GPU_ERR_MEM_TRAINING_MASK  
   0x0001L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_FW_LOAD_MASK   
   0x0002L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_WAFL_LINK_TRAINING_MASK
   0x0004L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_XGMI_LINK_TRAINING_MASK
   0x0008L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_CP_LINK_TRAINING_MASK  
   0x0010L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_USR_DP_LINK_TRAINING_MASK  
   0x0020L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_MEM_TEST_MASK  
   0x0040L
+#define MP0_SMN_C2PMSG_126__GPU_ERR_HBM_BIST_TEST_MASK 
   0x0080L
+#define MP0_SMN_C2PMSG_126__SOCKET_ID_MASK 
   0x0700L
+#define MP0_SMN_C2PMSG_126__AID_ID_MASK
   0x1800L
+#define MP0_SMN_C2PMSG_126__HBM_ID_MASK
   0x2000L
+#define MP0_SMN_C2PMSG_126__BOOT_STATUS_MASK   
   0x80000000L
 //MP0_SMN_IH_CREDIT
 #define MP0_SMN_IH_CREDIT__CREDIT_VALUE__SHIFT 
   0x0
 #define MP0_SMN_IH_CREDIT__CLIENT_ID__SHIFT
   0x10
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: Add UVD_VCPU_INT_EN2 to dpg sram

2023-10-18 Thread Hawking Zhang

Add RAS-specific programming to dpg sram.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index f85d18cd74ec..810bbfccd6f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1760,6 +1760,11 @@ static void vcn_v4_0_3_enable_ras(struct amdgpu_device 
*adev,
  SOC15_DPG_MODE_OFFSET(VCN, 0, regVCN_RAS_CNTL),
  tmp, 0, indirect);
 
+   tmp = UVD_VCPU_INT_EN2__RASCNTL_VCPU_VCODEC_EN_MASK;
+   WREG32_SOC15_DPG_MODE(inst_idx,
+ SOC15_DPG_MODE_OFFSET(VCN, 0, 
regUVD_VCPU_INT_EN2),
+ tmp, 0, indirect);
+
tmp = UVD_SYS_INT_EN__RASCNTL_VCPU_VCODEC_EN_MASK;
WREG32_SOC15_DPG_MODE(inst_idx,
  SOC15_DPG_MODE_OFFSET(VCN, 0, regUVD_SYS_INT_EN),
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: Enable software RAS in vcn v4_0_3

2023-10-18 Thread Hawking Zhang

Set VCN/JPEG RAS masks to enable software RAS for
VCN and JPEG.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 973073e07b2a..7de1eb7c959d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2606,7 +2606,9 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
IP_VERSION(2, 6, 0) ||
amdgpu_ip_version(adev, VCN_HWIP, 0) ==
-   IP_VERSION(4, 0, 0))
+   IP_VERSION(4, 0, 0) ||
+   amdgpu_ip_version(adev, VCN_HWIP, 0) ==
+   IP_VERSION(4, 0, 3))
adev->ras_hw_enabled |= (1 << 
AMDGPU_RAS_BLOCK__VCN |
1 << 
AMDGPU_RAS_BLOCK__JPEG);
else
-- 
2.17.1

[PATCH] drm/amdgpu: fallback to old RAS error message for aqua_vanjaram

2023-09-08 Thread Hawking Zhang

So driver doesn't generate incorrect message until
the new format is settled down for aqua_vanjaram

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8eb6f6943778..632478874f7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1053,7 +1053,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
info->ce_count = obj->err_data.ce_count;
 
if (err_data.ce_count) {
-   if (adev->smuio.funcs &&
+   if (!adev->aid_mask &&
+   adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
@@ -1073,7 +1074,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
}
}
if (err_data.ue_count) {
-   if (adev->smuio.funcs &&
+   if (!adev->aid_mask &&
+   adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
-- 
2.17.1

[PATCH] drm/amdgpu: fallback to old RAS error message for aqua_vanjaram

2023-09-08 Thread Hawking Zhang

So driver doesn't generate incorrect message until
the new format is settled down for aqua_vanjaram

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8eb6f6943778..dee7b5b705e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1053,7 +1053,8 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
info->ce_count = obj->err_data.ce_count;
 
if (err_data.ce_count) {
-   if (adev->smuio.funcs &&
+   if (!adev->aid_mask &&
+   adev->smuio.funcs &&
adev->smuio.funcs->get_socket_id &&
adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
-- 
2.17.1

[PATCH] drm/amdgpu: Correct se_num and reg_inst for gfx v9_4_3 ras counters

2023-09-06 Thread Hawking Zhang

gfx_v9_4_3_ue|ce_reg_list is an array per gfx core instance.
Correct the settings of se_num and reg_inst for some of
the gfx ras counters so all the available register instances
can be polled for ras status.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 40 -
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 0a26a00074a6..a60d1a8405d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3653,19 +3653,19 @@ static const struct amdgpu_gfx_ras_reg_entry 
gfx_v9_4_3_ce_reg_list[] = {
AMDGPU_GFX_GC_CANE_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_CE_ERR_STATUS_LO, 
regSPI_CE_ERR_STATUS_HI),
1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SPI"},
-   AMDGPU_GFX_SPI_MEM, 8},
+   AMDGPU_GFX_SPI_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_CE_ERR_STATUS_LO, 
regSP0_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP0"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_CE_ERR_STATUS_LO, 
regSP1_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP1"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_CE_ERR_STATUS_LO, 
regSQ_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SQ"},
-   AMDGPU_GFX_SQ_MEM, 8},
+   AMDGPU_GFX_SQ_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQC_CE_EDC_LO, regSQC_CE_EDC_HI),
5, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SQC"},
-   AMDGPU_GFX_SQC_MEM, 8},
+   AMDGPU_GFX_SQC_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCX_CE_ERR_STATUS_LO, 
regTCX_CE_ERR_STATUS_HI),
2, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCX"},
AMDGPU_GFX_TCX_MEM, 1},
@@ -3674,22 +3674,22 @@ static const struct amdgpu_gfx_ras_reg_entry 
gfx_v9_4_3_ce_reg_list[] = {
AMDGPU_GFX_TCC_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTA_CE_EDC_LO, regTA_CE_EDC_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TA"},
-   AMDGPU_GFX_TA_MEM, 8},
+   AMDGPU_GFX_TA_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCI_CE_EDC_LO_REG, 
regTCI_CE_EDC_HI_REG),
-   31, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCI"},
+   27, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCI"},
AMDGPU_GFX_TCI_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTCP_CE_EDC_LO_REG, 
regTCP_CE_EDC_HI_REG),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TCP"},
-   AMDGPU_GFX_TCP_MEM, 8},
+   AMDGPU_GFX_TCP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regTD_CE_EDC_LO, regTD_CE_EDC_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"TD"},
-   AMDGPU_GFX_TD_MEM, 8},
+   AMDGPU_GFX_TD_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regGCEA_CE_ERR_STATUS_LO, 
regGCEA_CE_ERR_STATUS_HI),
16, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"GCEA"},
AMDGPU_GFX_GCEA_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regLDS_CE_ERR_STATUS_LO, 
regLDS_CE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"LDS"},
-   AMDGPU_GFX_LDS_MEM, 1},
+   AMDGPU_GFX_LDS_MEM, 4},
 };
 
 static const struct amdgpu_gfx_ras_reg_entry gfx_v9_4_3_ue_reg_list[] = {
@@ -3713,19 +3713,19 @@ static const struct amdgpu_gfx_ras_reg_entry 
gfx_v9_4_3_ue_reg_list[] = {
AMDGPU_GFX_GC_CANE_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSPI_UE_ERR_STATUS_LO, 
regSPI_UE_ERR_STATUS_HI),
1, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SPI"},
-   AMDGPU_GFX_SPI_MEM, 8},
+   AMDGPU_GFX_SPI_MEM, 1},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP0_UE_ERR_STATUS_LO, 
regSP0_UE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP0"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSP1_UE_ERR_STATUS_LO, 
regSP1_UE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATUS_VALID), 
"SP1"},
-   AMDGPU_GFX_SP_MEM, 1},
+   AMDGPU_GFX_SP_MEM, 4},
{{AMDGPU_RAS_REG_ENTRY(GC, 0, regSQ_UE_ERR_STATUS_LO, 
regSQ_UE_ERR_STATUS_HI),
10, (AMDGPU_RAS_ERR_INFO_VALID | AMDGPU_RAS_ERR_STATU

[PATCH] drm/amdgpu: Free ras cmd input buffer properly

2023-08-29 Thread Hawking Zhang

Do not access the pointer for the ras input cmd buffer
if it has not even been allocated.

Signed-off-by: Hawking Zhang 
Reviewed-by: Stanley Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e47600a8e88e..8eb6f6943778 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -764,7 +764,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
union ta_ras_cmd_input *info;
-   int ret = 0;
+   int ret;
 
if (!con)
return -EINVAL;
@@ -774,7 +774,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Force issue enable or disable ras feature commands */
if (head->block != AMDGPU_RAS_BLOCK__GFX &&
!amdgpu_ras_is_feature_allowed(adev, head))
-   goto out;
+   return 0;
 
/* Only enable gfx ras feature from host side */
if (head->block == AMDGPU_RAS_BLOCK__GFX &&
@@ -802,16 +802,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
enable ? "enable":"disable",
get_ras_block_str(head),
amdgpu_ras_is_poison_mode_supported(adev), ret);
-   goto out;
+   return ret;
}
+
+   kfree(info);
}
 
/* setup the obj */
__amdgpu_ras_feature_enable(adev, head, enable);
-out:
-   if (head->block == AMDGPU_RAS_BLOCK__GFX)
-   kfree(info);
-   return ret;
+
+   return 0;
 }
 
 /* Only used in device probe stage and called only once. */
-- 
2.17.1

[PATCH] drm/amdgpu: Free ras cmd input buffer properly

2023-08-29 Thread Hawking Zhang

Do not access the pointer for the ras input cmd buffer
if it has not even been allocated.

Signed-off-by: Hawking Zhang 
Reviewed-by: Stanley Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e47600a8e88e..8eb6f6943778 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -764,7 +764,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
union ta_ras_cmd_input *info;
-   int ret = 0;
+   int ret;
 
if (!con)
return -EINVAL;
@@ -774,7 +774,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Force issue enable or disable ras feature commands */
if (head->block != AMDGPU_RAS_BLOCK__GFX &&
!amdgpu_ras_is_feature_allowed(adev, head))
-   goto out;
+   return 0;
 
/* Only enable gfx ras feature from host side */
if (head->block == AMDGPU_RAS_BLOCK__GFX &&
@@ -802,16 +802,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
enable ? "enable":"disable",
get_ras_block_str(head),
amdgpu_ras_is_poison_mode_supported(adev), ret);
-   goto out;
+   return ret;
}
+
+   kfree(info);
}
 
/* setup the obj */
__amdgpu_ras_feature_enable(adev, head, enable);
-out:
-   if (head->block == AMDGPU_RAS_BLOCK__GFX)
-   kfree(info);
-   return ret;
+
+   return 0;
 }
 
 /* Only used in device probe stage and called only once. */
-- 
2.17.1

[PATCH] drm/amdgpu: Free ras cmd input buffer properly

2023-08-29 Thread Hawking Zhang

Do not access the pointer for the ras input cmd buffer
if it has not even been allocated.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e47600a8e88e..16c5fe487ea0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -804,13 +804,13 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
amdgpu_ras_is_poison_mode_supported(adev), ret);
goto out;
}
+
+   kfree(info);
}
 
/* setup the obj */
__amdgpu_ras_feature_enable(adev, head, enable);
-out:
-   if (head->block == AMDGPU_RAS_BLOCK__GFX)
-   kfree(info);
+
return ret;
 }
 
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: Support query ecc cap for aqua_vanjaram

2023-08-25 Thread Hawking Zhang

Driver queries umc_info v4_0 to identify ecc cap
for aqua_vanjaram

Signed-off-by: Hawking Zhang 
Reviewed-by: Candice Li 
---
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c   | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index 835980e94b9e..fb2681dd6b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -217,6 +217,7 @@ union umc_info {
struct atom_umc_info_v3_1 v31;
struct atom_umc_info_v3_2 v32;
struct atom_umc_info_v3_3 v33;
+   struct atom_umc_info_v4_0 v40;
 };
 
 union vram_info {
@@ -508,9 +509,8 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
 
if (amdgpu_atom_parse_data_header(mode_info->atom_context,
index, &size, &frev, &crev, &data_offset)) {
+   umc_info = (union umc_info *)(mode_info->atom_context->bios + 
data_offset);
if (frev == 3) {
-   umc_info = (union umc_info *)
-   (mode_info->atom_context->bios + data_offset);
switch (crev) {
case 1:
umc_config = 
le32_to_cpu(umc_info->v31.umc_config);
@@ -533,6 +533,20 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
/* unsupported crev */
return false;
}
+   } else if (frev == 4) {
+   switch (crev) {
+   case 0:
+   umc_config1 = 
le32_to_cpu(umc_info->v40.umc_config1);
+   ecc_default_enabled =
+   (umc_config1 & 
UMC_CONFIG1__ENABLE_ECC_CAPABLE) ? true : false;
+   break;
+   default:
+   /* unsupported crev */
+   return false;
+   }
+   } else {
+   /* unsupported frev */
+   return false;
}
}
 
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: Add umc_info v4_0 structure

2023-08-25 Thread Hawking Zhang

To be used by aqua_vanjaram

Signed-off-by: Hawking Zhang 
Reviewed-by: Candice Li 
---
 drivers/gpu/drm/amd/include/atomfirmware.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h 
b/drivers/gpu/drm/amd/include/atomfirmware.h
index e68c1e280322..fa7d6ced786f 100644
--- a/drivers/gpu/drm/amd/include/atomfirmware.h
+++ b/drivers/gpu/drm/amd/include/atomfirmware.h
@@ -3117,6 +3117,24 @@ enum atom_umc_config1_def {
UMC_CONFIG1__ENABLE_ECC_CAPABLE = 0x0001,
 };
 
+struct atom_umc_info_v4_0 {
+   struct atom_common_table_header table_header;
+   uint32_t ucode_reserved[5];
+   uint8_t umcip_min_ver;
+   uint8_t umcip_max_ver;
+   uint8_t vram_type;
+   uint8_t umc_config;
+   uint32_t mem_refclk_10khz;
+   uint32_t clk_reserved[4];
+   uint32_t golden_reserved;
+   uint32_t umc_config1;
+   uint32_t reserved[2];
+   uint8_t channel_num;
+   uint8_t channel_width;
+   uint8_t channel_reserve[2];
+   uint8_t umc_info_reserved[16];
+};
+
 /* 
   ***
 Data Table vram_info  structure
-- 
2.17.1

[PATCH] drm/amdgpu: Allow issue disable gfx ras cmd to firmware

2023-08-23 Thread Hawking Zhang

Disable gfx ras command is needed in some use cases
like live migration.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 378478cf9c21..7db6baa16236 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -769,9 +769,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!con)
return -EINVAL;
 
-   /* Do not enable ras feature if it is not allowed */
-   if (enable &&
-   head->block != AMDGPU_RAS_BLOCK__GFX &&
+   /* For non-gfx ip, do not enable ras feature if it is not allowed.
+* For gfx ip, regardless of feature support status,
+* force issue enable or disable ras feature commands */
+   if (head->block != AMDGPU_RAS_BLOCK__GFX &&
!amdgpu_ras_is_feature_allowed(adev, head))
goto out;
 
-- 
2.17.1

[PATCH] drm/amdgpu: Fix the return for gpu mode1_reset

2023-08-19 Thread Hawking Zhang

amdgpu_device_mode1_reset reports that gpu mode1 reset
succeeded (ret = 0) as long as the wait_for_bootloader call
succeeds, regardless of the status reported by smu or
psp firmware. As a result, the driver continues executing
recovery even if smu or psp fails to perform mode1 reset.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5586146b8c76..533daba2accb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4701,12 +4701,12 @@ int amdgpu_device_mode1_reset(struct amdgpu_device 
*adev)
}
 
if (ret)
-   dev_err(adev->dev, "GPU mode1 reset failed\n");
+   goto mode1_reset_failed;
 
amdgpu_device_load_pci_state(adev->pdev);
ret = amdgpu_psp_wait_for_bootloader(adev);
if (ret)
-   return ret;
+   goto mode1_reset_failed;
 
/* wait for asic to come out of reset */
for (i = 0; i < adev->usec_timeout; i++) {
@@ -4717,8 +4717,17 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
udelay(1);
}
 
+   if (i >= adev->usec_timeout) {
+   ret = -ETIMEDOUT;
+   goto mode1_reset_failed;
+   }
+
amdgpu_atombios_scratch_regs_engine_hung(adev, false);
 
+   return 0;
+
+mode1_reset_failed:
+   dev_err(adev->dev, "GPU mode1 reset failed\n");
return ret;
 }
 
-- 
2.17.1

[PATCH] drm/amdgpu: Remove unnecessary ras cap check

2023-08-09 Thread Hawking Zhang

RAS global isr will only be invoked by hardware
interrupt, so there is no need to query ras capability in the isr.
In addition, amdgpu_ras_interrupt_fatal_error_handler
ensures the isr won't be called from guest linux
side by accident. The RAS cap check in the isr that was
introduced to fix an sriov crash is not needed any more.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 00658c2816dc..c58b31121fd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2970,10 +2970,6 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
-   amdgpu_ras_check_supported(adev);
-   if (!adev->ras_hw_enabled)
-   return;
-
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-- 
2.17.1

[PATCH 3/3] drm/amdgpu: Issue ras enable_feature for gfx ip only

2023-07-03 Thread Hawking Zhang

For non-GFX IP blocks, set up ras obj if ras feature
is allowed. For GFX IP blocks, force issue ras
enable_feature command to firmware and only set up ras
obj if ras feature is allowed

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 +
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8524365761b6..2e9154bbec64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -761,16 +761,6 @@ static int __amdgpu_ras_feature_enable(struct 
amdgpu_device *adev,
return 0;
 }
 
-static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
-   struct ras_common_if *head)
-{
-   if (amdgpu_ras_is_feature_allowed(adev, head) ||
-   amdgpu_ras_is_poison_mode_supported(adev))
-   return 1;
-   else
-   return 0;
-}
-
 /* wrapper of psp_ras_enable_features */
 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
struct ras_common_if *head, bool enable)
@@ -782,7 +772,16 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!con)
return -EINVAL;
 
-   if (head->block == AMDGPU_RAS_BLOCK__GFX) {
+   /* Do not enable ras feature if it is not allowed */
+   if (enable &&
+   head->block != AMDGPU_RAS_BLOCK__GFX &&
+   !amdgpu_ras_is_feature_allowed(adev, head))
+   goto out;
+
+   /* Only enable gfx ras feature from host side */
+   if (head->block == AMDGPU_RAS_BLOCK__GFX &&
+   !amdgpu_sriov_vf(adev) &&
+   !amdgpu_ras_intr_triggered()) {
info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
if (!info)
return -ENOMEM;
@@ -798,16 +797,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
.error_type = 
amdgpu_ras_error_to_ta(head->type),
};
}
-   }
 
-   /* Do not enable if it is not allowed. */
-   if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
-   goto out;
-
-   /* Only enable ras feature operation handle on host side */
-   if (head->block == AMDGPU_RAS_BLOCK__GFX &&
-   !amdgpu_sriov_vf(adev) &&
-   !amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
dev_err(adev->dev, "ras %s %s failed poison:%d 
ret:%d\n",
-- 
2.17.1

[PATCH 2/3] drm/amdgpu: Remove gfx v11_0_3 ras_late_init call

2023-07-03 Thread Hawking Zhang

amdgpu_ras_late_init will invoke ras_late_init call
per IP block

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 26 --
 1 file changed, 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 3a7af59e83ca..66d38890d393 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4650,26 +4650,6 @@ static int gfx_v11_0_early_init(void *handle)
return gfx_v11_0_init_microcode(adev);
 }
 
-static int gfx_v11_0_ras_late_init(void *handle)
-{
-   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   struct ras_common_if *gfx_common_if;
-   int ret;
-
-   gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
-   if (!gfx_common_if)
-   return -ENOMEM;
-
-   gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX;
-
-   ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true);
-   if (ret)
-   dev_warn(adev->dev, "Failed to enable gfx11 ras feature\n");
-
-   kfree(gfx_common_if);
-   return 0;
-}
-
 static int gfx_v11_0_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -4683,12 +4663,6 @@ static int gfx_v11_0_late_init(void *handle)
if (r)
return r;
 
-   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) {
-   r = gfx_v11_0_ras_late_init(handle);
-   if (r)
-   return r;
-   }
-
return 0;
 }
 
-- 
2.17.1

[PATCH 1/3] drm/amdgpu: Apply poison mode check to GFX IP only

2023-07-03 Thread Hawking Zhang

For GFX IP that only supports poison consumption, GFX
RAS won't be marked as enabled. i.e., hardware doesn't
support gfx sram ecc. But driver still needs to issue
firmware to enable poison consumption mode for GFX IP.
In such case, check poison mode and treat GFX IP as
RAS capable IP block.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 041112c7fbbd..8524365761b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3150,6 +3150,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 * that the ras block supports ras function.
 */
if (!ret &&
+   block == AMDGPU_RAS_BLOCK__GFX &&
amdgpu_ras_is_poison_mode_supported(adev) &&
amdgpu_ras_get_ras_block(adev, block, 0))
ret = 1;
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is supported

2023-06-11 Thread Hawking Zhang

GFX v11_0_3 ras needs to be enabled if poison mode
is supported. Driver doesn't need to issue a feature
enable call in gfx_v11_0 late init phase. The ras
late init call is already centralized to
amdgpu_ras_late_init.
In addition, move poison_mode check out of common
helpers like amdgpu_ras_is_supported and
amdgpu_ras_is_feature_allowed to ensure only GFX RAS
is enabled when poison mode is supported.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 -
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 26 -
 2 files changed, 16 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index dd7cdc234d7e..35e70860d628 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -126,6 +126,7 @@ static bool amdgpu_ras_check_bad_page_unlock(struct 
amdgpu_ras *con,
uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
+static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev);
 #ifdef CONFIG_X86_MCE_AMD
 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
 struct mce_notifier_adev_list {
@@ -757,16 +758,6 @@ static int __amdgpu_ras_feature_enable(struct 
amdgpu_device *adev,
return 0;
 }
 
-static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev,
-   struct ras_common_if *head)
-{
-   if (amdgpu_ras_is_feature_allowed(adev, head) ||
-   amdgpu_ras_is_poison_mode_supported(adev))
-   return 1;
-   else
-   return 0;
-}
-
 /* wrapper of psp_ras_enable_features */
 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
struct ras_common_if *head, bool enable)
@@ -797,7 +788,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
}
 
/* Do not enable if it is not allowed. */
-   if (enable && !amdgpu_ras_check_feature_allowed(adev, head))
+   if (enable && !amdgpu_ras_is_feature_allowed(adev, head))
goto out;
 
/* Only enable ras feature operation handle on host side */
@@ -2420,9 +2411,9 @@ static bool amdgpu_ras_asic_supported(struct 
amdgpu_device *adev)
 }
 
 /*
- * this is workaround for vega20 workstation sku,
- * force enable gfx ras, ignore vbios gfx ras flag
- * due to GC EDC can not write
+ * Common helpers for device or IP specific RAS quirks including
+ * a). Enable gfx ras on D16406 or D36002 board
+ * b). Enable gfx ras in gfx_v11_0_3 if poison mode is supported
  */
 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
 {
@@ -2431,10 +2422,16 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device 
*adev)
if (!ctx)
return;
 
+   /* Enable gfx ras on specific board */
if (strnstr(ctx->vbios_version, "D16406",
sizeof(ctx->vbios_version)) ||
-   strnstr(ctx->vbios_version, "D36002",
-   sizeof(ctx->vbios_version)))
+   strnstr(ctx->vbios_version, "D36002",
+   sizeof(ctx->vbios_version)))
+   adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
+
+   /* Enable gfx ras on gfx_v11_0_3 if poison mode is supported */
+   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3) &&
+   amdgpu_ras_is_poison_mode_supported(adev))
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
 }
 
@@ -2502,6 +2499,8 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
   1 << AMDGPU_RAS_BLOCK__MMHUB);
}
 
+   amdgpu_ras_query_poison_mode(adev);
+
amdgpu_ras_get_quirks(adev);
 
/* hw_supported needs to be aligned with RAS block mask. */
@@ -2659,8 +2658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
 
-   amdgpu_ras_query_poison_mode(adev);
-
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
@@ -3115,26 +3112,12 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, 
struct amdgpu_ras *ras_co
 int amdgpu_ras_is_supported(struct amdgpu_device *adev,
unsigned int block)
 {
-   int ret = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
if (block >= AMDGPU_RAS_BLOCK_COUNT)
return 0;
 
-   ret = ras && (adev->ras_enabled & (1 << block));
-
-   /* For the special asic with mem ecc enabled but sram ecc
-* not enabled, even if the ras block is not supported on
-* .ras_enabled, if the asic supports poison mode and the
-* ras block has ras configuration, it can be considered
-* tha

[PATCH 1/2] drm/amdgpu: Only create err_count sysfs when hw_op is supported

2023-06-11 Thread Hawking Zhang

Some IP blocks only support partial ras feature and don't
have ras counter and/or ras error status register at all.
Driver should not create err_count sysfs node for those
IP blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 31 ++---
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a6c3265cdbc4..dd7cdc234d7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2757,23 +2757,28 @@ int amdgpu_ras_block_late_init(struct amdgpu_device 
*adev,
goto cleanup;
}
 
-   r = amdgpu_ras_sysfs_create(adev, ras_block);
-   if (r)
-   goto interrupt;
+   if (ras_obj->hw_ops &&
+   (ras_obj->hw_ops->query_ras_error_count ||
+ras_obj->hw_ops->query_ras_error_status)) {
+   r = amdgpu_ras_sysfs_create(adev, ras_block);
+   if (r)
+   goto interrupt;
 
-   /* Those are the cached values at init.
-*/
-   query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
-   if (!query_info)
-   return -ENOMEM;
-   memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
+   /* Those are the cached values at init.
+*/
+   query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
+   if (!query_info)
+   return -ENOMEM;
+   memcpy(&query_info->head, ras_block, sizeof(struct 
ras_common_if));
 
-   if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, 
query_info) == 0) {
-   atomic_set(&con->ras_ce_count, ce_count);
-   atomic_set(&con->ras_ue_count, ue_count);
+   if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, 
query_info) == 0) {
+   atomic_set(&con->ras_ce_count, ce_count);
+   atomic_set(&con->ras_ue_count, ue_count);
+   }
+
+   kfree(query_info);
}
 
-   kfree(query_info);
return 0;
 
 interrupt:
-- 
2.17.1

[PATCH] drm/amd/pm: Fix power context allocation in SMU13

2023-06-01 Thread Hawking Zhang

From: Lijo Lazar 

Use the right data structure for allocation.

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index da059b02a153..09ac66ab9c34 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -534,11 +534,11 @@ int smu_v13_0_init_power(struct smu_context *smu)
if (smu_power->power_context || smu_power->power_context_size != 0)
return -EINVAL;
 
-   smu_power->power_context = kzalloc(sizeof(struct smu_13_0_dpm_context),
+   smu_power->power_context = kzalloc(sizeof(struct 
smu_13_0_power_context),
   GFP_KERNEL);
if (!smu_power->power_context)
return -ENOMEM;
-   smu_power->power_context_size = sizeof(struct smu_13_0_dpm_context);
+   smu_power->power_context_size = sizeof(struct smu_13_0_power_context);
 
return 0;
 }
-- 
2.17.1

[PATCH] drm/amdgpu: drop temp programming for pagefault handling

2023-04-12 Thread Hawking Zhang

This programming was introduced as a temporary workaround and is not needed anymore.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c | 22 --
 1 file changed, 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c
index be0d0f47415e..13712640fa46 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v3_0.c
@@ -417,34 +417,12 @@ static void gfxhub_v3_0_set_fault_enable_default(struct 
amdgpu_device *adev,
tmp = REG_SET_FIELD(tmp, CP_DEBUG, CPG_UTCL1_ERROR_HALT_DISABLE, 1);
WREG32_SOC15(GC, 0, regCP_DEBUG, tmp);
 
-   /**
-* Set GRBM_GFX_INDEX in broad cast mode
-* before programming GL1C_UTCL0_CNTL1 and SQG_CONFIG
-*/
-   WREG32_SOC15(GC, 0, regGRBM_GFX_INDEX, regGRBM_GFX_INDEX_DEFAULT);
-
-   /**
-* Retry respond mode: RETRY
-* Error (no retry) respond mode: SUCCESS
-*/
-   tmp = RREG32_SOC15(GC, 0, regGL1C_UTCL0_CNTL1);
-   tmp = REG_SET_FIELD(tmp, GL1C_UTCL0_CNTL1, RESP_MODE, 0);
-   tmp = REG_SET_FIELD(tmp, GL1C_UTCL0_CNTL1, RESP_FAULT_MODE, 0x2);
-   WREG32_SOC15(GC, 0, regGL1C_UTCL0_CNTL1, tmp);
-
/* These registers are not accessible to VF-SRIOV.
 * The PF will program them instead.
 */
if (amdgpu_sriov_vf(adev))
return;
 
-   /* Disable SQ XNACK interrupt for all VMIDs */
-   tmp = RREG32_SOC15(GC, 0, regSQG_CONFIG);
-   tmp = REG_SET_FIELD(tmp, SQG_CONFIG, XNACK_INTR_MASK,
-   SQG_CONFIG__XNACK_INTR_MASK_MASK >>
-   SQG_CONFIG__XNACK_INTR_MASK__SHIFT);
-   WREG32_SOC15(GC, 0, regSQG_CONFIG, tmp);
-
tmp = RREG32_SOC15(GC, 0, regGCVM_L2_PROTECTION_FAULT_CNTL);
tmp = REG_SET_FIELD(tmp, GCVM_L2_PROTECTION_FAULT_CNTL,
RANGE_PROTECTION_FAULT_ENABLE_DEFAULT, value);
-- 
2.34.1

[PATCH] drm/amdgpu: correct xgmi_wafl block name

2023-03-28 Thread Hawking Zhang

Fix a backward compatibility issue by keeping
the old name of the xgmi_wafl node.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 3fe24348d199..439925477fb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1068,7 +1068,7 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
return err;
}
 
-   strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl_pcs");
+   strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl");
ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;
-- 
2.17.1

[PATCH] drm/amdgpu: Add fatal error handling in nbio v4_3

2023-03-22 Thread Hawking Zhang

The GPU will stop working once a fatal error is detected.
It will inform the driver to do a reset to recover from
the fatal error.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c  | 79 +
 drivers/gpu/drm/amd/amdgpu/nbio_v4_3.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/soc21.c  | 15 -
 4 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c6dc3cd2a9de..5b1779021881 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,7 @@
 #include "amdgpu_atomfirmware.h"
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include "nbio_v4_3.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
 
@@ -2562,6 +2563,16 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = &nbio_v7_4_ras;
break;
+   case IP_VERSION(4, 3, 0):
+   if (adev->ras_hw_enabled | AMDGPU_RAS_BLOCK__DF)
+   /* unlike other generation of nbio ras,
+* nbio v4_3 only support fatal error interrupt
+* to inform software that DF is freezed due to
+* system fatal error event. driver should not
+* enable nbio ras in such case. Instead,
+* check DF RAS */
+   adev->nbio.ras = &nbio_v4_3_ras;
+   break;
default:
/* nbio ras is not available */
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
index 09fdcd20cb91..d5ed9e0e1a5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v4_3.c
@@ -26,6 +26,7 @@
 
 #include "nbio/nbio_4_3_0_offset.h"
 #include "nbio/nbio_4_3_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include <uapi/linux/kfd_ioctl.h>
 
 static void nbio_v4_3_remap_hdp_registers(struct amdgpu_device *adev)
@@ -538,3 +539,81 @@ const struct amdgpu_nbio_funcs nbio_v4_3_sriov_funcs = {
.remap_hdp_registers = nbio_v4_3_remap_hdp_registers,
.get_rom_offset = nbio_v4_3_get_rom_offset,
 };
+
+static int nbio_v4_3_set_ras_err_event_athub_irq_state(struct amdgpu_device 
*adev,
+  struct amdgpu_irq_src 
*src,
+  unsigned type,
+  enum 
amdgpu_interrupt_state state)
+{
+   /* The ras_controller_irq enablement should be done in psp bl when it
+* tries to enable ras feature. Driver only need to set the correct 
interrupt
+* vector for bare-metal and sriov use case respectively
+*/
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ 
RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
+ (state == 
AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_int_cntl);
+
+   return 0;
+}
+
+static int nbio_v4_3_process_err_event_athub_irq(struct amdgpu_device *adev,
+struct amdgpu_irq_src *source,
+struct amdgpu_iv_entry *entry)
+{
+   /* By design, the ih cookie for err_event_athub_irq should be written
+* to bif ring. since bif ring is not enabled, just leave process 
callback
+* as a dummy one.
+*/
+   return 0;
+}
+
+static const struct amdgpu_irq_src_funcs 
nbio_v4_3_ras_err_event_athub_irq_funcs = {
+   .set = nbio_v4_3_set_ras_err_event_athub_irq_state,
+   .process = nbio_v4_3_process_err_event_athub_irq,
+};
+
+static void nbio_v4_3_handle_ras_err_event_athub_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   if (REG_GET_FIELD(bif_doorbell_int_cntl,
+ BIF_DOORBELL_INT_CNTL,
+ RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+   /* driver has to clear the interrupt status when bif ring is 
disabled */
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+   BIF_DOORBELL_INT_CNTL,
+   
RAS_ATHUB_ERR_EVENT_INTERRUPT_CLE

[PATCH] drm/amdgpu: Initialize umc ras callback

2023-03-20 Thread Hawking Zhang

Fix a coding error which results in a null interrupt
handler for umc ras.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index da68ceaa024c..9e2e97207e53 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -232,7 +232,7 @@ int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
if (!ras->ras_block.ras_late_init)
ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
 
-   if (ras->ras_block.ras_cb)
+   if (!ras->ras_block.ras_cb)
ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
 
return 0;
-- 
2.17.1

[PATCH 10/10] drm/amdgpu: drop ras check at asic level for new blocks

2023-03-12 Thread Hawking Zhang

amdgpu_ras_register_ras_block should always be invoked
by ras_sw_init, where driver needs to check ras caps
at ip level, instead of asic level.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 834092099bff..c34f51be793c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3076,9 +3076,6 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device 
*adev,
if (!adev || !ras_block_obj)
return -EINVAL;
 
-   if (!amdgpu_ras_asic_supported(adev))
-   return 0;
-
ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
if (!ras_node)
return -ENOMEM;
-- 
2.17.1

[PATCH 08/10] drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init

2023-03-12 Thread Hawking Zhang

To align with other IP blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  9 
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c|  7 ++
 4 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index ab85b85496f2..a407357cb153 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -478,11 +478,10 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
-   if (!adev->gmc.xgmi.connected_to_cpu) {
-   adev->gmc.xgmi.ras = &xgmi_ras;
-   amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
-   adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras->ras_block.ras_comm;
-   }
+   /* xgmi ras block */
+   r = amdgpu_xgmi_ras_sw_init(adev);
+   if (r)
+   return r;
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index fef1575cd0cf..3fe24348d199 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1048,12 +1048,30 @@ struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
 
 struct amdgpu_xgmi_ras xgmi_ras = {
.ras_block = {
-   .ras_comm = {
-   .name = "xgmi_wafl",
-   .block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
-   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-   },
.hw_ops = &xgmi_ras_hw_ops,
.ras_late_init = amdgpu_xgmi_ras_late_init,
},
 };
+
+int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_xgmi_ras *ras;
+
+   if (!adev->gmc.xgmi.ras)
+   return 0;
+
+   ras = adev->gmc.xgmi.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras 
block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl_pcs");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm;
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 30dcc1681b4e..86fbf56938f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -73,5 +73,6 @@ static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device 
*adev,
adev->gmc.xgmi.hive_id &&
adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
 }
+int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 67c2a5186b8a..2a8dc9b52c2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1381,6 +1381,12 @@ static void gmc_v9_0_set_mca_ras_funcs(struct 
amdgpu_device *adev)
}
 }
 
+static void gmc_v9_0_set_xgmi_ras_funcs(struct amdgpu_device *adev)
+{
+   if (!adev->gmc.xgmi.connected_to_cpu)
+   adev->gmc.xgmi.ras = &xgmi_ras;
+}
+
 static int gmc_v9_0_early_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1404,6 +1410,7 @@ static int gmc_v9_0_early_init(void *handle)
gmc_v9_0_set_gfxhub_funcs(adev);
gmc_v9_0_set_hdp_ras_funcs(adev);
gmc_v9_0_set_mca_ras_funcs(adev);
+   gmc_v9_0_set_xgmi_ras_funcs(adev);
 
adev->gmc.shared_aperture_start = 0x2000000000000000ULL;
adev->gmc.shared_aperture_end =
-- 
2.17.1

[PATCH 07/10] drm/amdgpu: Rework mca ras sw_init

2023-03-12 Thread Hawking Zhang

To align with other IP blocks

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 13 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 72 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h |  9 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 15 +++---
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 44 ++-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.h   |  4 +-
 6 files changed, 103 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 551884dc5245..ab85b85496f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -465,6 +465,19 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
+   /* mca.x ras block */
+   r = amdgpu_mca_mp0_ras_sw_init(adev);
+   if (r)
+   return r;
+
+   r = amdgpu_mca_mp1_ras_sw_init(adev);
+   if (r)
+   return r;
+
+   r = amdgpu_mca_mpio_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 51c2a82e2fa4..0b545bdcd636 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -70,3 +70,75 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device 
*adev,
 
amdgpu_mca_reset_error_count(adev, mc_status_addr);
 }
+
+int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_mca_ras_block *ras;
+
+   if (!adev->mca.mp0.ras)
+   return 0;
+
+   ras = adev->mca.mp0.ras;
+
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register mca.mp0 ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "mca.mp0");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->mca.mp0.ras_if = &ras->ras_block.ras_comm;
+
+   return 0;
+}
+
+int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev)
+{
+int err;
+struct amdgpu_mca_ras_block *ras;
+
+if (!adev->mca.mp1.ras)
+return 0;
+
+ras = adev->mca.mp1.ras;
+
+err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+if (err) {
+dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
+return err;
+}
+
+strcpy(ras->ras_block.ras_comm.name, "mca.mp1");
+ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+adev->mca.mp1.ras_if = &ras->ras_block.ras_comm;
+
+return 0;
+}
+
+int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
+{
+int err;
+struct amdgpu_mca_ras_block *ras;
+
+if (!adev->mca.mpio.ras)
+return 0;
+
+ras = adev->mca.mpio.ras;
+
+err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+if (err) {
+dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
+return err;
+}
+
+strcpy(ras->ras_block.ras_comm.name, "mca.mpio");
+ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+adev->mca.mpio.ras_if = &ras->ras_block.ras_comm;
+
+return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index 7ce16d16e34b..997a073e2409 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -30,12 +30,7 @@ struct amdgpu_mca_ras {
struct amdgpu_mca_ras_block *ras;
 };
 
-struct amdgpu_mca_funcs {
-   void (*init)(struct amdgpu_device *adev);
-};
-
 struct amdgpu_mca {
-   const struct amdgpu_mca_funcs *funcs;
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
@@ -55,5 +50,7 @@ void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
 void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
  uint64_t mc_status_addr,
  void *ras_error_status);
-
+int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm

[PATCH 09/10] drm/amdgpu: Rework pcie_bif ras sw_init

2023-03-12 Thread Hawking Zhang

The pcie_bif ras block needs to be initialized as early
as possible to handle fatal errors detected in the hw_init
phase. Also align the pcie_bif ras sw_init with other
ras blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 23 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 17 ++---
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index 37d779b8e4a6..a3bc00577a7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -22,6 +22,29 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_nbio_ras *ras;
+
+   if (!adev->nbio.ras)
+   return 0;
+
+   ras = adev->nbio.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register pcie_bif ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "pcie_bif");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__PCIE_BIF;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->nbio.ras_if = &ras->ras_block.ras_comm;
+
+   return 0;
+}
+
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index a240336bbc6b..c686ff4bcc39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -106,5 +106,6 @@ struct amdgpu_nbio {
struct amdgpu_nbio_ras  *ras;
 };
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 63dfcc98152d..834092099bff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2555,20 +2555,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 * ras functions so hardware fatal error interrupt
 * can be enabled as early as possible */
switch (adev->asic_type) {
-   case CHIP_VEGA20:
-   case CHIP_ARCTURUS:
-   case CHIP_ALDEBARAN:
-   if (!adev->gmc.xgmi.connected_to_cpu) {
+   case IP_VERSION(7, 4, 0):
+   case IP_VERSION(7, 4, 1):
+   case IP_VERSION(7, 4, 4):
+   if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = &nbio_v7_4_ras;
-   amdgpu_ras_register_ras_block(adev, 
&adev->nbio.ras->ras_block);
-   adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
-   }
break;
default:
/* nbio ras is not available */
break;
}
 
+   /* nbio ras block needs to be enabled ahead of other ras blocks
+* to handle fatal error */
+   r = amdgpu_nbio_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt) {
r = adev->nbio.ras->init_ras_controller_interrupt(adev);
-- 
2.17.1

[PATCH 06/10] drm/amdgpu: Move hdp ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize hdp ras block only when mmhub ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 48 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   |  5 ---
 6 files changed, 55 insertions(+), 9 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 00c33ce38761..5f9ac1bcb6bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
-   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
+   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o amdgpu_hdp.o \
amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index a15bc513dd67..551884dc5245 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -460,6 +460,11 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
+   /* hdp ras block */
+   r = amdgpu_hdp_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
new file mode 100644
index 000000000000..b6cf801939aa
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+
+int amdgpu_hdp_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_hdp_ras *ras;
+
+   if (!adev->hdp.ras)
+   return 0;
+
+   ras = adev->hdp.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register hdp ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "hdp");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__HDP;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->hdp.ras_if = &ras->ras_block.ras_comm;
+
+   /* hdp ras follows amdgpu_ras_block_late_init_default for late init */
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
index ac5c61d3de2b..7b8a6152dc8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
@@ -43,5 +43,5 @@ struct amdgpu_hdp {
struct amdgpu_hdp_ras   *ras;
 };
 
-int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
+int amdgpu_hdp_ras_sw_init(struct amdgpu_device *adev);
 #endif /* __AMDGPU_HDP_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index b3bb7021012

[PATCH 05/10] drm/amdgpu: Move mmhub ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize mmhub ras block only when mmhub ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   |  5 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 46 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  9 -
 5 files changed, 54 insertions(+), 10 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index d4dfd48451ce..00c33ce38761 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
-   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o \
+   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index c764b57957e8..a15bc513dd67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -455,6 +455,11 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
if (r)
return r;
 
+   /* mmhub ras block */
+   r = amdgpu_mmhub_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
new file mode 100644
index 000000000000..0f6b1021fef3
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2023  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+
+int amdgpu_mmhub_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_mmhub_ras *ras;
+
+   if (!adev->mmhub.ras)
+   return 0;
+
+   ras = adev->mmhub.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register mmhub ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "mmhub");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MMHUB;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->mmhub.ras_if = &ras->ras_block.ras_comm;
+
+   /* mmhub ras follows amdgpu_ras_block_late_init_default for late init */
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 93430d3823c9..d21bb6dae56e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -48,5 +48,7 @@ struct amdgpu_mmhub {
struct amdgpu_mmhub_ras  *ras;
 };
 
+int amdgpu_mmhub_ras_sw_init(struct amdgpu_device *adev);
+
 #endif
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index e9b6599e790c..b3bb70210122 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1351,15 +1351,6 @@ static void gmc_

[PATCH 03/10] drm/amdgpu: Move umc ras block init to gmc ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize umc ras block only when umc ip block
supports ras. Driver queries ras capabilities after
early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  9 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 30 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  | 26 -
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 21 -
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 26 -
 7 files changed, 52 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 6830f671cde7..c764b57957e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -446,8 +446,15 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device 
*adev, uint64_t addr,
} while (fault->timestamp < tmp);
 }
 
-int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev)
+int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
 {
+   int r;
+
+   /* umc ras block */
+   r = amdgpu_umc_ras_sw_init(adev);
+   if (r)
+   return r;
+
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 0305b660cd17..f1773abd5e1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -351,7 +351,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
  uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 uint16_t pasid);
-int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
+int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1b8574bc4463..da68ceaa024c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -208,6 +208,36 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, 
true);
 }
 
+int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_umc_ras *ras;
+
+   if (!adev->umc.ras)
+   return 0;
+
+   ras = adev->umc.ras;
+
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register umc ras block!\n");
+   return err;
+   }
+
+   strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->umc.ras_if = &ras->ras_block.ras_comm;
+
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
+
+   if (!ras->ras_block.ras_cb)
+   ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
+
+   return 0;
+}
+
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 36e19336f3b3..d7f1229ff11f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -87,6 +87,7 @@ struct amdgpu_umc {
unsigned long active_mask;
 };
 
+int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index c59c2332d191..924f6f38fae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -703,25 +703,8 @@ static void gmc_v10_0_set_umc_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-   if (adev->umc.ras) {
-   amdgpu_ras_register_ras_block(adev, &adev->umc.ras->ras_block);
-
-   strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
-   adev->umc.ras->ras_block.ras_comm.block = AMDGPU_

[PATCH 02/10] drm/amdgpu: Move vcn ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize vcn ras block only when vcn ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 29 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   |  6 +++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   |  6 +++--
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 02d428ddf2f8..8664a5301b2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1158,19 +1158,28 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device 
*adev,
return 0;
 }
 
-void amdgpu_vcn_set_ras_funcs(struct amdgpu_device *adev)
+int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev)
 {
+   int err;
+   struct amdgpu_vcn_ras *ras;
+
if (!adev->vcn.ras)
-   return;
+   return 0;
 
-   amdgpu_ras_register_ras_block(adev, &adev->vcn.ras->ras_block);
+   ras = adev->vcn.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register vcn ras block!\n");
+   return err;
+   }
 
-   strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
-   adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
-   adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
-   adev->vcn.ras_if = &adev->vcn.ras->ras_block.ras_comm;
+   strcpy(ras->ras_block.ras_comm.name, "vcn");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+   adev->vcn.ras_if = &ras->ras_block.ras_comm;
 
-   /* If don't define special ras_late_init function, use default 
ras_late_init */
-   if (!adev->vcn.ras->ras_block.ras_late_init)
-   adev->vcn.ras->ras_block.ras_late_init = 
amdgpu_ras_block_late_init;
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+   return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index d3e2af902907..c730949ece7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -400,6 +400,6 @@ void amdgpu_debugfs_vcn_fwlog_init(struct amdgpu_device 
*adev,
 int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-void amdgpu_vcn_set_ras_funcs(struct amdgpu_device *adev);
+int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index b0b0e69c6a94..223e7dfe4618 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -225,6 +225,10 @@ static int vcn_v2_5_sw_init(void *handle)
if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)
adev->vcn.pause_dpg_mode = vcn_v2_5_pause_dpg_mode;
 
+   r = amdgpu_vcn_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -2031,6 +2035,4 @@ static void vcn_v2_5_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   amdgpu_vcn_set_ras_funcs(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 43d587404c3e..720ab36f9c92 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -181,6 +181,10 @@ static int vcn_v4_0_sw_init(void *handle)
if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)
adev->vcn.pause_dpg_mode = vcn_v4_0_pause_dpg_mode;
 
+   r = amdgpu_vcn_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -2123,6 +2127,4 @@ static void vcn_v4_0_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   amdgpu_vcn_set_ras_funcs(adev);
 }
-- 
2.17.1

[PATCH 04/10] drm/amdgpu: Correct gfx ras_late_init callback

2023-03-12 Thread Hawking Zhang

Use default gfx ras_late_init callback for gfx
ras block.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 35ed46b9249c..c50d59855011 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -725,7 +725,7 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
 
/* If not define special ras_late_init function, use gfx default 
ras_late_init */
if (!ras->ras_block.ras_late_init)
-   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+   ras->ras_block.ras_late_init = amdgpu_gfx_ras_late_init;
 
/* If not defined special ras_cb function, use default ras_cb */
if (!ras->ras_block.ras_cb)
-- 
2.17.1

[PATCH 01/10] drm/amdgpu: Move jpeg ras block init to ras sw_init

2023-03-12 Thread Hawking Zhang

Initialize jpeg ras block only when jpeg ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 29 
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c   |  6 +++--
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   |  6 +++--
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 8f472517d181..74695161cf1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -235,19 +235,28 @@ int amdgpu_jpeg_process_poison_irq(struct amdgpu_device 
*adev,
return 0;
 }
 
-void jpeg_set_ras_funcs(struct amdgpu_device *adev)
+int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev)
 {
+   int err;
+   struct amdgpu_jpeg_ras *ras;
+
if (!adev->jpeg.ras)
-   return;
+   return 0;
 
-   amdgpu_ras_register_ras_block(adev, &adev->jpeg.ras->ras_block);
+   ras = adev->jpeg.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register jpeg ras block!\n");
+   return err;
+   }
 
-   strcpy(adev->jpeg.ras->ras_block.ras_comm.name, "jpeg");
-   adev->jpeg.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__JPEG;
-   adev->jpeg.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
-   adev->jpeg.ras_if = &adev->jpeg.ras->ras_block.ras_comm;
+   strcpy(ras->ras_block.ras_comm.name, "jpeg");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__JPEG;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+   adev->jpeg.ras_if = &ras->ras_block.ras_comm;
 
-   /* If don't define special ras_late_init function, use default 
ras_late_init */
-   if (!adev->jpeg.ras->ras_block.ras_late_init)
-   adev->jpeg.ras->ras_block.ras_late_init = 
amdgpu_ras_block_late_init;
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+   return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index e8ca3e32ad52..0ca76f0f23e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -72,6 +72,6 @@ int amdgpu_jpeg_dec_ring_test_ib(struct amdgpu_ring *ring, 
long timeout);
 int amdgpu_jpeg_process_poison_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-void jpeg_set_ras_funcs(struct amdgpu_device *adev);
+int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev);
 
 #endif /*__AMDGPU_JPEG_H__*/
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
index f2b743a93915..6b1887808782 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
@@ -138,6 +138,10 @@ static int jpeg_v2_5_sw_init(void *handle)
adev->jpeg.inst[i].external.jpeg_pitch = SOC15_REG_OFFSET(JPEG, 
i, mmUVD_JPEG_PITCH);
}
 
+   r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -806,6 +810,4 @@ static void jpeg_v2_5_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   jpeg_set_ras_funcs(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
index 3beb731b2ce5..3129094baccc 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
@@ -113,6 +113,10 @@ static int jpeg_v4_0_sw_init(void *handle)
adev->jpeg.internal.jpeg_pitch = regUVD_JPEG_PITCH_INTERNAL_OFFSET;
adev->jpeg.inst->external.jpeg_pitch = SOC15_REG_OFFSET(JPEG, 0, 
regUVD_JPEG_PITCH);
 
+   r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
@@ -685,6 +689,4 @@ static void jpeg_v4_0_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   jpeg_set_ras_funcs(adev);
 }
-- 
2.17.1

[PATCH 00/10] add ras sw_init (v2)

2023-03-12 Thread Hawking Zhang

We are moving from soc ras to ip ras to address issues as follows
- RAS sw block init is mixed in early_init and sw_init
- RAS cap check is mixed with both soc check and ip check.

RAS cap check is now only available in amdgpu_ras_init,
based on the cap query from bios. RAS sw block init is all
moved to ras sw_init and follows ip based ras cap check
from amdgpu_ras_init, instead of the check in soc level.

v2: simplify the ras check (Stanley/Tao)

Hawking Zhang (10):
  drm/amdgpu: Move jpeg ras block init to ras sw_init
  drm/amdgpu: Move vcn ras block init to ras sw_init
  drm/amdgpu: Move umc ras block init to gmc ras sw_init
  drm/amdgpu: Correct gfx ras_late_init callback
  drm/amdgpu: Move mmhub ras block init to ras sw_init
  drm/amdgpu: Move hdp ras block init to ras sw_init
  drm/amdgpu: Rework mca ras sw_init
  drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init
  drm/amdgpu: Rework pcie_bif ras sw_init
  drm/amdgpu: drop ras check at asic level for new blocks

 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   | 41 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c   | 48 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c  | 29 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c   | 72 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h   |  9 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 46 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c  | 23 
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 20 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 30 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 29 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  | 28 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 26 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c| 21 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 59 +++
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c |  5 --
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c|  6 +-
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c|  6 +-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 44 +-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.h |  4 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c |  6 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c |  6 +-
 31 files changed, 389 insertions(+), 186 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c

-- 
2.17.1

[PATCH] drm/amdgpu: Retire pcie_gen3_enable function

2023-03-06 Thread Hawking Zhang

Not needed since VI. Drop the function so
we don't duplicate code when introducing new asics.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/nv.c| 17 -
 drivers/gpu/drm/amd/amdgpu/soc15.c | 20 
 drivers/gpu/drm/amd/amdgpu/soc21.c | 17 -
 drivers/gpu/drm/amd/amdgpu/vi.c| 20 
 4 files changed, 74 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 492a8b148227..d56cba10cd5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -519,21 +519,6 @@ static int nv_set_vce_clocks(struct amdgpu_device *adev, 
u32 evclk, u32 ecclk)
return 0;
 }
 
-static void nv_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void nv_program_aspm(struct amdgpu_device *adev)
 {
if (!amdgpu_device_should_use_aspm(adev))
@@ -1041,8 +1026,6 @@ static int nv_common_hw_init(void *handle)
if (adev->nbio.funcs->apply_l1_link_width_reconfig_wa)
adev->nbio.funcs->apply_l1_link_width_reconfig_wa(adev);
 
-   /* enable pcie gen2/3 link */
-   nv_pcie_gen3_enable(adev);
/* enable aspm */
nv_program_aspm(adev);
/* setup nbio registers */
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 2c37b49f5c00..1064972dc558 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -609,24 +609,6 @@ static int soc15_set_vce_clocks(struct amdgpu_device 
*adev, u32 evclk, u32 ecclk
return 0;
 }
 
-static void soc15_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (adev->flags & AMD_IS_APU)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void soc15_program_aspm(struct amdgpu_device *adev)
 {
if (!amdgpu_device_should_use_aspm(adev))
@@ -1183,8 +1165,6 @@ static int soc15_common_hw_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-   /* enable pcie gen2/3 link */
-   soc15_pcie_gen3_enable(adev);
/* enable aspm */
soc15_program_aspm(adev);
/* setup nbio registers */
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 21e271877c4c..e56f2bc73930 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -370,21 +370,6 @@ static int soc21_set_vce_clocks(struct amdgpu_device 
*adev, u32 evclk, u32 ecclk
return 0;
 }
 
-static void soc21_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void soc21_program_aspm(struct amdgpu_device *adev)
 {
if (!amdgpu_device_should_use_aspm(adev))
@@ -714,8 +699,6 @@ static int soc21_common_hw_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-   /* enable pcie gen2/3 link */
-   soc21_pcie_gen3_enable(adev);
/* enable aspm */
soc21_program_aspm(adev);
/* setup nbio registers */
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 12ef782eb478..2512b70ea992 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -1105,24 +1105,6 @@ static int vi_set_vce_clocks(struct amdgpu_device *adev, 
u32 evclk, u32 ecclk)
return 0;
 }
 
-static void vi_pcie_gen3_enable(struct amdgpu_device *adev)
-{
-   if (pci_is_root_bus(adev->pdev->bus))
-   return;
-
-   if (amdgpu_pcie_gen2 == 0)
-   return;
-
-   if (adev->flags & AMD_IS_APU)
-   return;
-
-   if (!(adev->pm.pcie_gen_mask & (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
-   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)))
-   return;
-
-   /* todo */
-}
-
 static void vi_enable_aspm(struct amdgpu_device *adev)
 {
u32 data, orig;
@@ -1743,8 +1725,6 @@ static int

[PATCH 2/2] drm/amdgpu: Move to common helper to query soc rev_id

2023-03-06 Thread Hawking Zhang

Replace soc15, nv, soc21 get_rev_id callbacks with a common
helper so we don't need to duplicate code when introducing
new asics.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 
 drivers/gpu/drm/amd/amdgpu/nv.c|  7 +--
 drivers/gpu/drm/amd/amdgpu/soc15.c |  7 +--
 drivers/gpu/drm/amd/amdgpu/soc21.c |  7 +--
 5 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9387731afb8b..527795f921a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1149,7 +1149,7 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device 
*adev,
 u32 reg_addr, u32 reg_data);
 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
   u32 reg_addr, u64 reg_data);
-
+u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev);
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b1b815dc69b3..13fa8a2709c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -807,6 +807,18 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device 
*adev,
spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 }
 
+/**
+ * amdgpu_device_get_rev_id - query device rev_id
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Return device rev_id
+ */
+u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
+{
+   return adev->nbio.funcs->get_rev_id(adev);
+}
+
 /**
  * amdgpu_invalid_rreg - dummy reg read function
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 60c10132ed32..492a8b148227 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -566,11 +566,6 @@ void nv_set_virt_ops(struct amdgpu_device *adev)
adev->virt.ops = &xgpu_nv_virt_ops;
 }
 
-static uint32_t nv_get_rev_id(struct amdgpu_device *adev)
-{
-   return adev->nbio.funcs->get_rev_id(adev);
-}
-
 static bool nv_need_full_reset(struct amdgpu_device *adev)
 {
return true;
@@ -712,7 +707,7 @@ static int nv_common_early_init(void *handle)
 
adev->asic_funcs = &nv_asic_funcs;
 
-   adev->rev_id = nv_get_rev_id(adev);
+   adev->rev_id = amdgpu_device_get_rev_id(adev);
adev->external_rev_id = 0xff;
/* TODO: split the GC and PG flags based on the relevant IP version for 
which
 * they are relevant.
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 6392d10e6eaf..2c37b49f5c00 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -653,11 +653,6 @@ const struct amdgpu_ip_block_version 
vega10_common_ip_block =
.funcs = &soc15_common_ip_funcs,
 };
 
-static uint32_t soc15_get_rev_id(struct amdgpu_device *adev)
-{
-   return adev->nbio.funcs->get_rev_id(adev);
-}
-
 static void soc15_reg_base_init(struct amdgpu_device *adev)
 {
/* Set IP register base before any HW register access */
@@ -907,7 +902,7 @@ static int soc15_common_early_init(void *handle)
adev->se_cac_rreg = &soc15_se_cac_rreg;
adev->se_cac_wreg = &soc15_se_cac_wreg;
 
-   adev->rev_id = soc15_get_rev_id(adev);
+   adev->rev_id = amdgpu_device_get_rev_id(adev);
adev->external_rev_id = 0xFF;
/* TODO: split the GC and PG flags based on the relevant IP version for 
which
 * they are relevant.
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 9d91e20a22bb..21e271877c4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -411,11 +411,6 @@ const struct amdgpu_ip_block_version soc21_common_ip_block 
=
.funcs = &soc21_common_ip_funcs,
 };
 
-static uint32_t soc21_get_rev_id(struct amdgpu_device *adev)
-{
-   return adev->nbio.funcs->get_rev_id(adev);
-}
-
 static bool soc21_need_full_reset(struct amdgpu_device *adev)
 {
switch (adev->ip_versions[GC_HWIP][0]) {
@@ -557,7 +552,7 @@ static int soc21_common_early_init(void *handle)
 
adev->asic_funcs = &soc21_asic_funcs;
 
-   adev->rev_id = soc21_get_rev_id(adev);
+   adev->rev_id = amdgpu_device_get_rev_id(adev);
adev->external_rev_id = 0xff;
switch (adev->ip_versions[GC_HWIP][0]) {
case IP_VERSION(11, 0, 0):
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: Move to common indirect reg access helper

2023-03-06 Thread Hawking Zhang

Replace soc15, nv, soc21 specific callbacks with a common
one, so we don't need to duplicate code when introducing
new asics.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  4 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 +++---
 drivers/gpu/drm/amd/amdgpu/nv.c| 49 ++
 drivers/gpu/drm/amd/amdgpu/soc15.c | 49 ++
 drivers/gpu/drm/amd/amdgpu/soc21.c | 48 ++---
 5 files changed, 30 insertions(+), 152 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 07e846510005..9387731afb8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1142,16 +1142,12 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, 
uint32_t offset, uint8_t value)
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset);
 
 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
-   u32 pcie_index, u32 pcie_data,
u32 reg_addr);
 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
- u32 pcie_index, u32 pcie_data,
  u32 reg_addr);
 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
-u32 pcie_index, u32 pcie_data,
 u32 reg_addr, u32 reg_data);
 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
-  u32 pcie_index, u32 pcie_data,
   u32 reg_addr, u64 reg_data);
 
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 77e6c8bf7190..b1b815dc69b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -676,20 +676,20 @@ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, 
u32 index, u64 v)
  * amdgpu_device_indirect_rreg - read an indirect register
  *
  * @adev: amdgpu_device pointer
- * @pcie_index: mmio register offset
- * @pcie_data: mmio register offset
  * @reg_addr: indirect register address to read from
  *
  * Returns the value of indirect register @reg_addr
  */
 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
-   u32 pcie_index, u32 pcie_data,
u32 reg_addr)
 {
-   unsigned long flags;
-   u32 r;
+   unsigned long flags, pcie_index, pcie_data;
void __iomem *pcie_index_offset;
void __iomem *pcie_data_offset;
+   u32 r;
+
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -707,20 +707,20 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device 
*adev,
  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
  *
  * @adev: amdgpu_device pointer
- * @pcie_index: mmio register offset
- * @pcie_data: mmio register offset
  * @reg_addr: indirect register address to read from
  *
  * Returns the value of indirect register @reg_addr
  */
 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
- u32 pcie_index, u32 pcie_data,
  u32 reg_addr)
 {
-   unsigned long flags;
-   u64 r;
+   unsigned long flags, pcie_index, pcie_data;
void __iomem *pcie_index_offset;
void __iomem *pcie_data_offset;
+   u64 r;
+
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -750,13 +750,15 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device 
*adev,
  *
  */
 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
-u32 pcie_index, u32 pcie_data,
 u32 reg_addr, u32 reg_data)
 {
-   unsigned long flags;
+   unsigned long flags, pcie_index, pcie_data;
void __iomem *pcie_index_offset;
void __iomem *pcie_data_offset;
 
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
@@ -779,13 +781,15 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device 
*adev,
  *
  */
 void amdgpu_device_indirect_wreg64(struct amdgpu_device *ade

[PATCH 11/11] drm/amdgpu: drop ras check at asic level for new blocks

2023-03-05 Thread Hawking Zhang

amdgpu_ras_register_ras_block should always be invoked
by ras_sw_init, where driver needs to check ras caps
at ip level, instead of asic level.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f42480b8a8d3..e6b4b0bf622b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3081,9 +3081,6 @@ int amdgpu_ras_register_ras_block(struct amdgpu_device 
*adev,
if (!adev || !ras_block_obj)
return -EINVAL;
 
-   if (!amdgpu_ras_asic_supported(adev))
-   return 0;
-
ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
if (!ras_node)
return -ENOMEM;
-- 
2.17.1

[PATCH 10/11] drm/amdgpu: Rework pcie_bif ras sw_init

2023-03-05 Thread Hawking Zhang

The pcie_bif ras block needs to be initialized as early
as possible to handle fatal errors detected in the hw_init
phase. Also align the pcie_bif ras sw_init with other
ras blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c | 23 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 16 
 3 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
index 37d779b8e4a6..a3bc00577a7c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c
@@ -22,6 +22,29 @@
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_nbio_ras *ras;
+
+   if (!adev->nbio.ras)
+   return 0;
+
+   ras = adev->nbio.ras;
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register pcie_bif ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "pcie_bif");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__PCIE_BIF;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->nbio.ras_if = &ras->ras_block.ras_comm;
+
+   return 0;
+}
+
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index a240336bbc6b..c686ff4bcc39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -106,5 +106,6 @@ struct amdgpu_nbio {
struct amdgpu_nbio_ras  *ras;
 };
 
+int amdgpu_nbio_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_nbio_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 63dfcc98152d..f42480b8a8d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2558,17 +2558,25 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
case CHIP_VEGA20:
case CHIP_ARCTURUS:
case CHIP_ALDEBARAN:
-   if (!adev->gmc.xgmi.connected_to_cpu) {
+   if (!adev->gmc.xgmi.connected_to_cpu)
adev->nbio.ras = _v7_4_ras;
-   amdgpu_ras_register_ras_block(adev, 
>nbio.ras->ras_block);
-   adev->nbio.ras_if = >nbio.ras->ras_block.ras_comm;
-   }
break;
default:
/* nbio ras is not available */
break;
}
 
+   /* nbio ras block needs to be enabled ahead of other ras blocks
+* to handle fatal error */
+   if (!adev->gmc.xgmi.connected_to_cpu &&
+   amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) {
+   r = amdgpu_nbio_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize pcie_bif ras 
block!\n");
+   return r;
+   }
+   }
+
if (adev->nbio.ras &&
adev->nbio.ras->init_ras_controller_interrupt) {
r = adev->nbio.ras->init_ras_controller_interrupt(adev);
-- 
2.17.1

[PATCH 09/11] drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init

2023-03-05 Thread Hawking Zhang

To align with other IP blocks.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  9 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c|  7 ++
 4 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 524e2c9b3012..d4685d22be60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -500,9 +500,12 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
 
/* xgmi ras block */
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
-   adev->gmc.xgmi.ras = _ras;
-   amdgpu_ras_register_ras_block(adev, 
>gmc.xgmi.ras->ras_block);
-   adev->gmc.xgmi.ras_if = >gmc.xgmi.ras->ras_block.ras_comm;
+   r = amdgpu_xgmi_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize xgmi_wafl_pcs 
ras block!\n");
+   return r;
+   }
+
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index fef1575cd0cf..3fe24348d199 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1048,12 +1048,30 @@ struct amdgpu_ras_block_hw_ops  xgmi_ras_hw_ops = {
 
 struct amdgpu_xgmi_ras xgmi_ras = {
.ras_block = {
-   .ras_comm = {
-   .name = "xgmi_wafl",
-   .block = AMDGPU_RAS_BLOCK__XGMI_WAFL,
-   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-   },
.hw_ops = _ras_hw_ops,
.ras_late_init = amdgpu_xgmi_ras_late_init,
},
 };
+
+int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_xgmi_ras *ras;
+
+   if (!adev->gmc.xgmi.ras)
+   return 0;
+
+   ras = adev->gmc.xgmi.ras;
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras 
block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl_pcs");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->gmc.xgmi.ras_if = >ras_block.ras_comm;
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 30dcc1681b4e..86fbf56938f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -73,5 +73,6 @@ static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device 
*adev,
adev->gmc.xgmi.hive_id &&
adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
 }
+int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 67c2a5186b8a..2a8dc9b52c2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1381,6 +1381,12 @@ static void gmc_v9_0_set_mca_ras_funcs(struct 
amdgpu_device *adev)
}
 }
 
+static void gmc_v9_0_set_xgmi_ras_funcs(struct amdgpu_device *adev)
+{
+   if (!adev->gmc.xgmi.connected_to_cpu)
+   adev->gmc.xgmi.ras = _ras;
+}
+
 static int gmc_v9_0_early_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1404,6 +1410,7 @@ static int gmc_v9_0_early_init(void *handle)
gmc_v9_0_set_gfxhub_funcs(adev);
gmc_v9_0_set_hdp_ras_funcs(adev);
gmc_v9_0_set_mca_ras_funcs(adev);
+   gmc_v9_0_set_xgmi_ras_funcs(adev);
 
adev->gmc.shared_aperture_start = 0x2000ULL;
adev->gmc.shared_aperture_end =
-- 
2.17.1

[PATCH 08/11] drm/amdgpu: Rework mca ras sw_init

2023-03-05 Thread Hawking Zhang

To align with other IP blocks

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 21 
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 72 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h |  9 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 15 +++---
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 44 ++-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.h   |  4 +-
 6 files changed, 111 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 087a75374610..524e2c9b3012 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -477,6 +477,27 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
}
}
 
+   /* mca.x ras block */
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MCA)) {
+   r = amdgpu_mca_mp0_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize mca.mp0 ras 
block!\n");
+   return r;
+   }
+
+   r = amdgpu_mca_mp1_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize mca.mp1 ras 
block!\n");
+   return r;
+   }
+
+   r = amdgpu_mca_mpio_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize mca.mpio ras 
block!\n");
+   return r;
+   }
+   }
+
/* xgmi ras block */
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
adev->gmc.xgmi.ras = _ras;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 51c2a82e2fa4..0b545bdcd636 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -70,3 +70,75 @@ void amdgpu_mca_query_ras_error_count(struct amdgpu_device 
*adev,
 
amdgpu_mca_reset_error_count(adev, mc_status_addr);
 }
+
+int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_mca_ras_block *ras;
+
+   if (!adev->mca.mp0.ras)
+   return 0;
+
+   ras = adev->mca.mp0.ras;
+
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register mca.mp0 ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "mca.mp0");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->mca.mp0.ras_if = >ras_block.ras_comm;
+
+   return 0;
+}
+
+int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev)
+{
+int err;
+struct amdgpu_mca_ras_block *ras;
+
+if (!adev->mca.mp1.ras)
+return 0;
+
+ras = adev->mca.mp1.ras;
+
+err = amdgpu_ras_register_ras_block(adev, >ras_block);
+if (err) {
+dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n");
+return err;
+}
+
+strcpy(ras->ras_block.ras_comm.name, "mca.mp1");
+ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+adev->mca.mp1.ras_if = >ras_block.ras_comm;
+
+return 0;
+}
+
+int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev)
+{
+int err;
+struct amdgpu_mca_ras_block *ras;
+
+if (!adev->mca.mpio.ras)
+return 0;
+
+ras = adev->mca.mpio.ras;
+
+err = amdgpu_ras_register_ras_block(adev, >ras_block);
+if (err) {
+dev_err(adev->dev, "Failed to register mca.mpio ras block!\n");
+return err;
+}
+
+strcpy(ras->ras_block.ras_comm.name, "mca.mpio");
+ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA;
+ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+adev->mca.mpio.ras_if = >ras_block.ras_comm;
+
+return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
index 7ce16d16e34b..997a073e2409 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
@@ -30,12 +30,7 @@ struct amdgpu_mca_ras {
struct amdgpu_mca_ras_block *ras;
 };
 
-struct amdgpu_mca_funcs {
-   void (*init)(struct amdgpu_device *adev);
-};
-
 struct amdgpu_mca {
-   const struct amdgpu_mca_funcs *funcs;
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
@@ -55,5 +

[PATCH 07/11] drm/amdgpu: Move hdp ras block init to ras sw_init

2023-03-05 Thread Hawking Zhang

Initialize hdp ras block only when hdp ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c |  9 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 48 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   |  5 ---
 6 files changed, 59 insertions(+), 9 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 00c33ce38761..5f9ac1bcb6bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
-   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
+   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o amdgpu_hdp.o \
amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index e90c59693c31..087a75374610 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -468,6 +468,15 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
}
}
 
+   /* hdp ras block */
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP)) {
+   r = amdgpu_hdp_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize hdp ras 
block!\n");
+   return r;
+   }
+   }
+
/* xgmi ras block */
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
adev->gmc.xgmi.ras = _ras;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
new file mode 100644
index ..b6cf801939aa
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+
+int amdgpu_hdp_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_hdp_ras *ras;
+
+   if (!adev->hdp.ras)
+   return 0;
+
+   ras = adev->hdp.ras;
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register hdp ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "hdp");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__HDP;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->hdp.ras_if = >ras_block.ras_comm;
+
+   /* hdp ras follows amdgpu_ras_block_late_init_default for late init */
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
index ac5c61d3de2b..7b8a6152dc8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h
@@ -43,5 +43,5 @@ struct amdgpu_hdp {
struct amdgpu_hdp_ras   *ras;
 };
 
-int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
+int amdgpu_hdp_ras_sw_i

[PATCH 06/11] drm/amdgpu: Move mmhub ras block init to ras sw_init

2023-03-05 Thread Hawking Zhang

Initialize mmhub ras block only when mmhub ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   |  9 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 46 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  9 -
 5 files changed, 58 insertions(+), 10 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index d4dfd48451ce..00c33ce38761 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -54,7 +54,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \
amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \
-   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o \
+   amdgpu_debugfs.o amdgpu_ids.o amdgpu_gmc.o amdgpu_mmhub.o \
amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 97a12d8d786a..e90c59693c31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -459,6 +459,15 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
}
}
 
+   /* mmhub ras block */
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) {
+   r = amdgpu_mmhub_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize mmhub ras 
block!\n");
+   return r;
+   }
+   }
+
/* xgmi ras block */
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
adev->gmc.xgmi.ras = _ras;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
new file mode 100644
index ..0f6b1021fef3
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2023  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+
+int amdgpu_mmhub_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_mmhub_ras *ras;
+
+   if (!adev->mmhub.ras)
+   return 0;
+
+   ras = adev->mmhub.ras;
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register mmhub ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "mmhub");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MMHUB;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->mmhub.ras_if = >ras_block.ras_comm;
+
+   /* mmhub ras follows amdgpu_ras_block_late_init_default for late init */
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 93430d3823c9..d21bb6dae56e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -48,5 +48,7 @@ struct amdgpu_mmhub {
struct amdgpu_mmhub_ras  *ras;
 };
 
+int amdgpu_mmhub_ras_sw_init(struct amdgpu_device *adev);
+
 #endif
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdg

[PATCH 05/11] drm/amdgpu: Move umc ras block init to gmc ras sw_init

2023-03-05 Thread Hawking Zhang

Initialize umc ras block only when umc ip block
supports ras. Driver queries ras capabilities after
early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 16 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 30 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  | 26 -
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 21 -
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 26 -
 7 files changed, 58 insertions(+), 64 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 6830f671cde7..97a12d8d786a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -446,9 +446,21 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device 
*adev, uint64_t addr,
} while (fault->timestamp < tmp);
 }
 
-int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev)
+int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
 {
-   if (!adev->gmc.xgmi.connected_to_cpu) {
+   int r;
+
+   /* umc ras block */
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) {
+   r = amdgpu_umc_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize umc ras 
block!\n");
+   return r;
+   }
+   }
+
+   /* xgmi ras block */
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) {
adev->gmc.xgmi.ras = _ras;
amdgpu_ras_register_ras_block(adev, 
>gmc.xgmi.ras->ras_block);
adev->gmc.xgmi.ras_if = >gmc.xgmi.ras->ras_block.ras_comm;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 0305b660cd17..f1773abd5e1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -351,7 +351,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
  uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 uint16_t pasid);
-int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
+int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
 int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1b8574bc4463..da68ceaa024c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -208,6 +208,36 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device 
*adev,
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, 
true);
 }
 
+int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err;
+   struct amdgpu_umc_ras *ras;
+
+   if (!adev->umc.ras)
+   return 0;
+
+   ras = adev->umc.ras;
+
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register umc ras block!\n");
+   return err;
+   }
+
+   strcpy(adev->umc.ras->ras_block.ras_comm.name, "umc");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__UMC;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->umc.ras_if = >ras_block.ras_comm;
+
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_umc_ras_late_init;
+
+   if (ras->ras_block.ras_cb)
+   ras->ras_block.ras_cb = amdgpu_umc_process_ras_data_cb;
+
+   return 0;
+}
+
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 36e19336f3b3..d7f1229ff11f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -87,6 +87,7 @@ struct amdgpu_umc {
unsigned long active_mask;
 };
 
+int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
 int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index c59c2332d191..924f6f38fae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

[PATCH 04/11] drm/amdgpu: Init sdma ras block when ras is supported

2023-03-05 Thread Hawking Zhang

Initialize sdma ras block only when sdma ip block
supports ras features.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 9 ++---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 8 
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   | 9 ++---
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index b5affba22156..805c6605b43a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1851,9 +1851,12 @@ static int sdma_v4_0_sw_init(void *handle)
}
}
 
-   if (amdgpu_sdma_ras_sw_init(adev)) {
-   dev_err(adev->dev, "Failed to initialize sdma ras block!\n");
-   return -EINVAL;
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   r = amdgpu_sdma_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize sdma ras 
block!\n");
+   return r;
+   }
}
 
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 1b04700a4d55..9bf221cc07b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -1338,6 +1338,14 @@ static int sdma_v4_4_2_sw_init(void *handle)
}
}
 
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   r = amdgpu_sdma_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize sdma ras 
block!\n");
+   return r;
+   }
+   }
+
return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index 40e6b22daa22..4613d73c2535 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -1281,9 +1281,12 @@ static int sdma_v6_0_sw_init(void *handle)
return r;
}
 
-   if (amdgpu_sdma_ras_sw_init(adev)) {
-   dev_err(adev->dev, "Failed to initialize sdma ras block!\n");
-   return -EINVAL;
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   r = amdgpu_sdma_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize sdma ras 
block!\n");
+   return r;
+   }
}
 
return r;
-- 
2.17.1

[PATCH 03/11] drm/amdgpu: Init gfx ras block when ras is supported

2023-03-05 Thread Hawking Zhang

Initialize gfx ras block only when gfx ip block
supports ras features.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 9 ++---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 9 ++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 3bf697a80cf2..d7d4847e2644 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1409,9 +1409,12 @@ static int gfx_v11_0_sw_init(void *handle)
if (r)
return r;
 
-   if (amdgpu_gfx_ras_sw_init(adev)) {
-   dev_err(adev->dev, "Failed to initialize gfx ras block!\n");
-   return -EINVAL;
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+   r = amdgpu_gfx_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize gfx ras 
block!\n");
+   return r;
+   }
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index ae09fc1cfe6b..c9657e89d40e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2177,9 +2177,12 @@ static int gfx_v9_0_sw_init(void *handle)
if (r)
return r;
 
-   if (amdgpu_gfx_ras_sw_init(adev)) {
-   dev_err(adev->dev, "Failed to initialize gfx ras block!\n");
-   return -EINVAL;
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+   r = amdgpu_gfx_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize gfx ras 
block!\n");
+   return r;
+   }
}
 
return 0;
-- 
2.17.1

[PATCH 02/11] drm/amdgpu: Move vcn ras block init to ras sw_init

2023-03-05 Thread Hawking Zhang

Initialize vcn ras block only when vcn ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 29 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c   | 10 +++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 10 +++--
 4 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 02d428ddf2f8..8664a5301b2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -1158,19 +1158,28 @@ int amdgpu_vcn_process_poison_irq(struct amdgpu_device 
*adev,
return 0;
 }
 
-void amdgpu_vcn_set_ras_funcs(struct amdgpu_device *adev)
+int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev)
 {
+   int err;
+   struct amdgpu_vcn_ras *ras;
+
if (!adev->vcn.ras)
-   return;
+   return 0;
 
-   amdgpu_ras_register_ras_block(adev, >vcn.ras->ras_block);
+   ras = adev->vcn.ras;
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register vcn ras block!\n");
+   return err;
+   }
 
-   strcpy(adev->vcn.ras->ras_block.ras_comm.name, "vcn");
-   adev->vcn.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
-   adev->vcn.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
-   adev->vcn.ras_if = >vcn.ras->ras_block.ras_comm;
+   strcpy(ras->ras_block.ras_comm.name, "vcn");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__VCN;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+   adev->vcn.ras_if = >ras_block.ras_comm;
 
-   /* If don't define special ras_late_init function, use default 
ras_late_init */
-   if (!adev->vcn.ras->ras_block.ras_late_init)
-   adev->vcn.ras->ras_block.ras_late_init = 
amdgpu_ras_block_late_init;
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+   return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index d3e2af902907..c730949ece7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -400,6 +400,6 @@ void amdgpu_debugfs_vcn_fwlog_init(struct amdgpu_device 
*adev,
 int amdgpu_vcn_process_poison_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-void amdgpu_vcn_set_ras_funcs(struct amdgpu_device *adev);
+int amdgpu_vcn_ras_sw_init(struct amdgpu_device *adev);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index b0b0e69c6a94..043c9e0d956c 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -225,6 +225,14 @@ static int vcn_v2_5_sw_init(void *handle)
if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)
adev->vcn.pause_dpg_mode = vcn_v2_5_pause_dpg_mode;
 
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN)) {
+   r = amdgpu_vcn_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize vcn ras 
block!\n");
+   return r;
+   }
+   }
+
return 0;
 }
 
@@ -2031,6 +2039,4 @@ static void vcn_v2_5_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   amdgpu_vcn_set_ras_funcs(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 43d587404c3e..73b0b1b1beec 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -181,6 +181,14 @@ static int vcn_v4_0_sw_init(void *handle)
if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)
adev->vcn.pause_dpg_mode = vcn_v4_0_pause_dpg_mode;
 
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN)) {
+   r = amdgpu_vcn_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize vcn ras 
block!\n");
+   return r;
+   }
+   }
+
return 0;
 }
 
@@ -2123,6 +2131,4 @@ static void vcn_v4_0_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   amdgpu_vcn_set_ras_funcs(adev);
 }
-- 
2.17.1

[PATCH 01/11] drm/amdgpu: Move jpeg ras block init to ras sw_init

2023-03-05 Thread Hawking Zhang

Initialize jpeg ras block only when jpeg ip block
supports ras features. Driver queries ras capabilities
after early_init, ras block init needs to be moved to
sw_init.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 29 
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c   | 10 ++--
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c   | 10 ++--
 4 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 8f472517d181..74695161cf1b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -235,19 +235,28 @@ int amdgpu_jpeg_process_poison_irq(struct amdgpu_device 
*adev,
return 0;
 }
 
-void jpeg_set_ras_funcs(struct amdgpu_device *adev)
+int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev)
 {
+   int err;
+   struct amdgpu_jpeg_ras *ras;
+
if (!adev->jpeg.ras)
-   return;
+   return 0;
 
-   amdgpu_ras_register_ras_block(adev, >jpeg.ras->ras_block);
+   ras = adev->jpeg.ras;
+   err = amdgpu_ras_register_ras_block(adev, >ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register jpeg ras block!\n");
+   return err;
+   }
 
-   strcpy(adev->jpeg.ras->ras_block.ras_comm.name, "jpeg");
-   adev->jpeg.ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__JPEG;
-   adev->jpeg.ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
-   adev->jpeg.ras_if = >jpeg.ras->ras_block.ras_comm;
+   strcpy(ras->ras_block.ras_comm.name, "jpeg");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__JPEG;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__POISON;
+   adev->jpeg.ras_if = >ras_block.ras_comm;
 
-   /* If don't define special ras_late_init function, use default 
ras_late_init */
-   if (!adev->jpeg.ras->ras_block.ras_late_init)
-   adev->jpeg.ras->ras_block.ras_late_init = 
amdgpu_ras_block_late_init;
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+   return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
index e8ca3e32ad52..0ca76f0f23e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h
@@ -72,6 +72,6 @@ int amdgpu_jpeg_dec_ring_test_ib(struct amdgpu_ring *ring, 
long timeout);
 int amdgpu_jpeg_process_poison_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-void jpeg_set_ras_funcs(struct amdgpu_device *adev);
+int amdgpu_jpeg_ras_sw_init(struct amdgpu_device *adev);
 
 #endif /*__AMDGPU_JPEG_H__*/
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
index f2b743a93915..7400ed1449e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c
@@ -138,6 +138,14 @@ static int jpeg_v2_5_sw_init(void *handle)
adev->jpeg.inst[i].external.jpeg_pitch = SOC15_REG_OFFSET(JPEG, 
i, mmUVD_JPEG_PITCH);
}
 
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG)) {
+   r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize jpeg ras 
block!\n");
+   return r;
+   }
+   }
+
return 0;
 }
 
@@ -806,6 +814,4 @@ static void jpeg_v2_5_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   jpeg_set_ras_funcs(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
index 3beb731b2ce5..f9f682336b3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c
@@ -113,6 +113,14 @@ static int jpeg_v4_0_sw_init(void *handle)
adev->jpeg.internal.jpeg_pitch = regUVD_JPEG_PITCH_INTERNAL_OFFSET;
adev->jpeg.inst->external.jpeg_pitch = SOC15_REG_OFFSET(JPEG, 0, 
regUVD_JPEG_PITCH);
 
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG)) {
+   r = amdgpu_jpeg_ras_sw_init(adev);
+   if (r) {
+   dev_err(adev->dev, "Failed to initialize jpeg ras 
block!\n");
+   return r;
+   }
+   }
+
return 0;
 }
 
@@ -685,6 +693,4 @@ static void jpeg_v4_0_set_ras_funcs(struct amdgpu_device 
*adev)
default:
break;
}
-
-   jpeg_set_ras_funcs(adev);
 }
-- 
2.17.1

[PATCH 00/11] Add ras sw_init

2023-03-05 Thread Hawking Zhang

We are moving from soc ras to ip ras to address issues
as follows
- RAS sw block init is mixed in early_init and sw_init
- RAS cap check is mixed with both soc check and ip check.

RAS cap check is now only available in amdgpu_ras_init,
based on the cap query from bios. RAS sw block init is
all moved to ras sw_init and follows ip based ras cap
check from amdgpu_ras_init, instead of the check in
soc level.


Hawking Zhang (11):
  drm/amdgpu: Move jpeg ras block init to ras sw_init
  drm/amdgpu: Move vcn ras block init to ras sw_init
  drm/amdgpu: Init gfx ras block when ras is supported
  drm/amdgpu: Init sdma ras block when ras is supported
  drm/amdgpu: Move umc ras block init to gmc ras sw_init
  drm/amdgpu: Move mmhub ras block init to ras sw_init
  drm/amdgpu: Move hdp ras block init to ras sw_init
  drm/amdgpu: Rework mca ras sw_init
  drm/amdgpu: Rework xgmi_wafl_pcs ras sw_init
  drm/amdgpu: Rework pcie_bif ras sw_init
  drm/amdgpu: drop ras check at asic level for new blocks

 drivers/gpu/drm/amd/amdgpu/Makefile   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c   | 64 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c   | 48 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c  | 29 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.h  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c   | 72 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h   |  9 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c | 46 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.c  | 23 
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 19 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 30 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   | 29 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c  | 28 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c|  9 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  9 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 26 ++--
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c| 21 ++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 59 +++
 drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c |  5 --
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c| 10 +++-
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c| 10 +++-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 44 +-
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.h |  4 +-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c|  9 ++-
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  |  8 +++
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c|  9 ++-
 drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 10 +++-
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 10 +++-
 35 files changed, 462 insertions(+), 193 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.c

-- 
2.17.1

[PATCH] drm/amdgpu: fix incorrect active rb bitmap for gfx11

2023-02-19 Thread Hawking Zhang

GFX v11 changes RB_BACKEND_DISABLE related registers
from per SA to global ones. The approach to query active
rb bitmap needs to be changed accordingly. Querying the per
SE setting returns a wrong active RB bitmap, especially
in the case when some of the SAs are disabled. With the new
approach, driver will generate the active rb bitmap
based on active SA bitmap and global active RB bitmap.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 78 +-
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index e7e5a2c31896..7b7f01b304cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1503,44 +1503,70 @@ static void gfx_v11_0_select_se_sh(struct amdgpu_device 
*adev, u32 se_num,
WREG32_SOC15(GC, 0, regGRBM_GFX_INDEX, data);
 }
 
-static u32 gfx_v11_0_get_rb_active_bitmap(struct amdgpu_device *adev)
+static u32 gfx_v11_0_get_sa_active_bitmap(struct amdgpu_device *adev)
 {
-   u32 data, mask;
+   u32 gc_disabled_sa_mask, gc_user_disabled_sa_mask, sa_mask;
+
+   gc_disabled_sa_mask = RREG32_SOC15(GC, 0, regCC_GC_SA_UNIT_DISABLE);
+   gc_disabled_sa_mask = REG_GET_FIELD(gc_disabled_sa_mask,
+  CC_GC_SA_UNIT_DISABLE,
+  SA_DISABLE);
+   gc_user_disabled_sa_mask = RREG32_SOC15(GC, 0, 
regGC_USER_SA_UNIT_DISABLE);
+   gc_user_disabled_sa_mask = REG_GET_FIELD(gc_user_disabled_sa_mask,
+GC_USER_SA_UNIT_DISABLE,
+SA_DISABLE);
+   sa_mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_sh_per_se *
+   
adev->gfx.config.max_shader_engines);
 
-   data = RREG32_SOC15(GC, 0, regCC_RB_BACKEND_DISABLE);
-   data |= RREG32_SOC15(GC, 0, regGC_USER_RB_BACKEND_DISABLE);
+   return sa_mask & (~(gc_disabled_sa_mask | gc_user_disabled_sa_mask));
+}
 
-   data &= CC_RB_BACKEND_DISABLE__BACKEND_DISABLE_MASK;
-   data >>= GC_USER_RB_BACKEND_DISABLE__BACKEND_DISABLE__SHIFT;
+static u32 gfx_v11_0_get_rb_active_bitmap(struct amdgpu_device *adev)
+{
+   u32 gc_disabled_rb_mask, gc_user_disabled_rb_mask;
+   u32 rb_mask;
 
-   mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_backends_per_se /
-adev->gfx.config.max_sh_per_se);
+   gc_disabled_rb_mask = RREG32_SOC15(GC, 0, regCC_RB_BACKEND_DISABLE);
+   gc_disabled_rb_mask = REG_GET_FIELD(gc_disabled_rb_mask,
+   CC_RB_BACKEND_DISABLE,
+   BACKEND_DISABLE);
+   gc_user_disabled_rb_mask = RREG32_SOC15(GC, 0, 
regGC_USER_RB_BACKEND_DISABLE);
+   gc_user_disabled_rb_mask = REG_GET_FIELD(gc_user_disabled_rb_mask,
+GC_USER_RB_BACKEND_DISABLE,
+BACKEND_DISABLE);
+   rb_mask = 
amdgpu_gfx_create_bitmask(adev->gfx.config.max_backends_per_se *
+   
adev->gfx.config.max_shader_engines);
 
-   return (~data) & mask;
+   return rb_mask & (~(gc_disabled_rb_mask | gc_user_disabled_rb_mask));
 }
 
 static void gfx_v11_0_setup_rb(struct amdgpu_device *adev)
 {
-   int i, j;
-   u32 data;
-   u32 active_rbs = 0;
-   u32 rb_bitmap_width_per_sh = adev->gfx.config.max_backends_per_se /
-   adev->gfx.config.max_sh_per_se;
+   u32 rb_bitmap_width_per_sa;
+   u32 max_sa;
+   u32 active_sa_bitmap;
+   u32 global_active_rb_bitmap;
+   u32 active_rb_bitmap = 0;
+   u32 i;
 
-   mutex_lock(>grbm_idx_mutex);
-   for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
-   for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
-   gfx_v11_0_select_se_sh(adev, i, j, 0x);
-   data = gfx_v11_0_get_rb_active_bitmap(adev);
-   active_rbs |= data << ((i * 
adev->gfx.config.max_sh_per_se + j) *
-  rb_bitmap_width_per_sh);
-   }
+   /* query sa bitmap from SA_UNIT_DISABLE registers */
+   active_sa_bitmap = gfx_v11_0_get_sa_active_bitmap(adev);
+   /* query rb bitmap from RB_BACKEND_DISABLE registers */
+   global_active_rb_bitmap = gfx_v11_0_get_rb_active_bitmap(adev);
+
+   /* generate active rb bitmap according to active sa bitmap */
+   max_sa = adev->gfx.config.max_shader_engines *
+adev->gfx.config.max_sh_per_se;
+   rb_bitmap_width_per_sa = adev->gfx.config.max_backends_per_se /
+adev->gfx.config.max_sh_per_se

[PATCH] drm/amdgpu: fix incorrect active rb bitmap for gfx11

2023-02-18 Thread Hawking Zhang

GFX v11 changes RB_BACKEND_DISABLE related registers
from per SA to global ones. The approach to query active
rb bitmap needs to be changed accordingly. Querying the
per SE setting returns a wrong active RB bitmap, especially
in the case when some of the SAs are disabled. With the new
approach, driver will generate the active rb bitmap
based on active SA bitmap and global active RB bitmap.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 75 +-
 1 file changed, 49 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index e7e5a2c31896..87a6cdac3d45 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1503,44 +1503,67 @@ static void gfx_v11_0_select_se_sh(struct amdgpu_device 
*adev, u32 se_num,
WREG32_SOC15(GC, 0, regGRBM_GFX_INDEX, data);
 }
 
-static u32 gfx_v11_0_get_rb_active_bitmap(struct amdgpu_device *adev)
+static u32 gfx_v11_0_get_sa_active_bitmap(struct amdgpu_device *adev)
 {
-   u32 data, mask;
+   u32 gc_disabled_sa_mask, gc_user_disabled_sa_mask, sa_mask;
+
+   gc_disabled_sa_mask = RREG32_SOC15(GC, 0, regCC_GC_SA_UNIT_DISABLE);
+   gc_disabled_sa_mask = REG_GET_FIELD(gc_disabled_sa_mask,
+  CC_GC_SA_UNIT_DISABLE,
+  SA_DISABLE);
+   gc_user_disabled_sa_mask = RREG32_SOC15(GC, 0, 
regGC_USER_SA_UNIT_DISABLE);
+   gc_user_disabled_sa_mask = REG_GET_FIELD(gc_user_disabled_sa_mask,
+GC_USER_SA_UNIT_DISABLE,
+SA_DISABLE);
+   sa_mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_sh_per_se *
+   
adev->gfx.config.max_shader_engines);
 
-   data = RREG32_SOC15(GC, 0, regCC_RB_BACKEND_DISABLE);
-   data |= RREG32_SOC15(GC, 0, regGC_USER_RB_BACKEND_DISABLE);
+   return sa_mask & (~(gc_disabled_sa_mask | gc_user_disabled_sa_mask));
+}
 
-   data &= CC_RB_BACKEND_DISABLE__BACKEND_DISABLE_MASK;
-   data >>= GC_USER_RB_BACKEND_DISABLE__BACKEND_DISABLE__SHIFT;
+static u32 gfx_v11_0_get_rb_active_bitmap(struct amdgpu_device *adev)
+{
+   u32 gc_disabled_rb_mask, gc_user_disabled_rb_mask;
+   u32 rb_mask;
 
-   mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_backends_per_se /
-adev->gfx.config.max_sh_per_se);
+   gc_disabled_rb_mask = RREG32_SOC15(GC, 0, regCC_RB_BACKEND_DISABLE);
+   gc_disabled_rb_mask = REG_GET_FIELD(gc_disabled_rb_mask,
+   CC_RB_BACKEND_DISABLE,
+   BACKEND_DISABLE);
+   gc_user_disabled_rb_mask = RREG32_SOC15(GC, 0, 
regGC_USER_RB_BACKEND_DISABLE);
+   gc_user_disabled_rb_mask = REG_GET_FIELD(gc_user_disabled_rb_mask,
+GC_USER_RB_BACKEND_DISABLE,
+BACKEND_DISABLE);
+   rb_mask = 
amdgpu_gfx_create_bitmask(adev->gfx.config.max_backends_per_se *
+   
adev->gfx.config.max_shader_engines);
 
-   return (~data) & mask;
+   return rb_mask & (~(gc_disabled_rb_mask | gc_user_disabled_rb_mask));
 }
 
 static void gfx_v11_0_setup_rb(struct amdgpu_device *adev)
 {
-   int i, j;
-   u32 data;
-   u32 active_rbs = 0;
-   u32 rb_bitmap_width_per_sh = adev->gfx.config.max_backends_per_se /
-   adev->gfx.config.max_sh_per_se;
+   u32 active_rb_bitmap = 0;
+   u32 max_sa;
+   u32 active_sa_bitmap;
+   u32 global_active_rb_bitmap;
+   u32 i;
 
-   mutex_lock(>grbm_idx_mutex);
-   for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
-   for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
-   gfx_v11_0_select_se_sh(adev, i, j, 0x);
-   data = gfx_v11_0_get_rb_active_bitmap(adev);
-   active_rbs |= data << ((i * 
adev->gfx.config.max_sh_per_se + j) *
-  rb_bitmap_width_per_sh);
-   }
+   /* query sa bitmap from SA_UNIT_DISABLE registers */
+   active_sa_bitmap = gfx_v11_0_get_sa_active_bitmap(adev);
+   /* query rb bitmap from RB_BACKEND_DISABLE registers */
+   global_active_rb_bitmap = gfx_v11_0_get_rb_active_bitmap(adev);
+
+   /* generate active rb bitmap according to active sa bitmap */
+   max_sa = adev->gfx.config.max_shader_engines *
+adev->gfx.config.max_sh_per_se;
+   for (i = 0; i < max_sa; i++) {
+   if (active_sa_bitmap & (1 << i))
+   active_rb_bitmap |= 0x3 << (i * 2);
}

[PATCH] drm/amdgpu: allow query error counters for specific IP block

2023-01-03 Thread Hawking Zhang

amdgpu_ras_block_late_init will be invoked in IP
specific ras_late_init call as a common helper for
all the IP blocks.

However, when amdgpu_ras_block_late_init calls
amdgpu_ras_query_error_count to query ras error
counters, amdgpu_ras_query_error_count queries
all the IP blocks that support ras query interface.

This results in wrong error counters cached in
software copies when there are ras errors detected
at time zero or warm reset procedure. i.e., in
sdma_ras_late_init phase, it counts on sdma/mmhub
errors, while, in mmhub_ras_late_init phase, it
still counts on sdma/mmhub errors.

The change updates amdgpu_ras_query_error_count
interface to allow query specific ip error counter.
It introduces a new input parameter: query_info. If
query_info is NULL, it means query all the IP blocks;
otherwise, only query the IP block specified by
query_info.

Signed-off-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 +-
 2 files changed, 71 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 35b9f2ed2838..7fed63dc09bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1130,11 +1130,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 }
 
 /**
- * amdgpu_ras_query_error_count -- Get error counts of all IPs
+ * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
+ * @adev: pointer to AMD GPU device
+ * @ce_count: pointer to an integer to be set to the count of correctible 
errors.
+ * @ue_count: pointer to an integer to be set to the count of uncorrectible 
errors.
+ * @query_info: pointer to ras_query_if
+ *
+ * Return 0 for query success or do nothing, otherwise return an error
+ * on failures
+ */
+static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
+  unsigned long *ce_count,
+  unsigned long *ue_count,
+  struct ras_query_if *query_info)
+{
+   int ret;
+
+   if (!query_info)
+   /* do nothing if query_info is not specified */
+   return 0;
+
+   ret = amdgpu_ras_query_error_status(adev, query_info);
+   if (ret)
+   return ret;
+
+   *ce_count += query_info->ce_count;
+   *ue_count += query_info->ue_count;
+
+   /* some hardware/IP supports read to clear
+* no need to explictly reset the err status after the query call */
+   if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+   if (amdgpu_ras_reset_error_status(adev, query_info->head.block))
+   dev_warn(adev->dev,
+"Failed to reset error counter and error 
status\n");
+   }
+
+   return 0;
+}
+
+/**
+ * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
  * @adev: pointer to AMD GPU device
  * @ce_count: pointer to an integer to be set to the count of correctible 
errors.
  * @ue_count: pointer to an integer to be set to the count of uncorrectible
  * errors.
+ * @query_info: pointer to ras_query_if if the query request is only for
+ * specific ip block; if info is NULL, then the qurey request is for
+ * all the ip blocks that support query ras error counters/status
  *
  * If set, @ce_count or @ue_count, count and return the corresponding
  * error counts in those integer pointers. Return 0 if the device
@@ -1142,11 +1185,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
  */
 int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 unsigned long *ce_count,
-unsigned long *ue_count)
+unsigned long *ue_count,
+struct ras_query_if *query_info)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
unsigned long ce, ue;
+   int ret;
 
if (!adev->ras_enabled || !con)
return -EOPNOTSUPP;
@@ -1158,26 +1203,23 @@ int amdgpu_ras_query_error_count(struct amdgpu_device 
*adev,
 
ce = 0;
ue = 0;
-   list_for_each_entry(obj, >head, node) {
-   struct ras_query_if info = {
-   .head = obj->head,
-   };
-   int res;
-
-   res = amdgpu_ras_query_error_status(adev, );
-   if (res)
-   return res;
+   if (!query_info) {
+   /* query all the ip blocks that support ras query interface */
+   list_for_each_entry(obj, >head, node) {
+   struct ras_query_if info = {
+

[PATCH] drm/amdgpu: move convert_error_address out of umc_ras

2022-10-14 Thread Hawking Zhang

RAS error address translation algorithm is common
across dGPU and A + A platform as long as the SOC
integrates the same generation of UMC IP.

UMC RAS is managed by x86 MCA on A + A platform,
umc_ras in GPU driver is not initialized at all on
A + A platform. In such case, any umc_ras callback
implemented for dGPU config shouldn't be invoked
from A + A specific callback.

The change moves convert_error_address out of dGPU
umc_ras structure and makes it share between A + A
and dGPU config.

Signed-off-by: Hawking Zhang 
Reviewed-by: Stanley Yang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  3 ---
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   |  7 +++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |  4 +++-
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 75f1402101f4..41b3081fe415 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,6 +36,7 @@
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "umc_v6_7.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2885,10 +2886,17 @@ static int amdgpu_bad_page_notifier(struct 
notifier_block *nb,
/*
 * Translate UMC channel address to Physical address
 */
-   if (adev->umc.ras &&
-   adev->umc.ras->convert_ras_error_address)
-   adev->umc.ras->convert_ras_error_address(adev,
-   _data, m->addr, ch_inst, umc_inst);
+   switch (adev->ip_versions[UMC_HWIP][0]) {
+   case IP_VERSION(6, 7, 0):
+   umc_v6_7_convert_error_address(adev,
+   _data, m->addr, ch_inst, umc_inst);
+   break;
+   default:
+   dev_warn(adev->dev,
+"UMC address to Physical address translation is not 
supported\n");
+   kfree(err_data.err_addr);
+   return NOTIFY_DONE;
+   }
 
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index e46439274f3a..3629d8f292ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -51,9 +51,6 @@ struct amdgpu_umc_ras {
struct amdgpu_ras_block_object ras_block;
void (*err_cnt_init)(struct amdgpu_device *adev);
bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
-   void (*convert_ras_error_address)(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, uint64_t 
err_addr,
-   uint32_t ch_inst, uint32_t umc_inst);
void (*ecc_info_query_ras_error_count)(struct amdgpu_device *adev,
  void *ras_error_status);
void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 5d5d031c9e7d..72fd963f178b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -187,9 +187,9 @@ static void umc_v6_7_ecc_info_query_ras_error_count(struct 
amdgpu_device *adev,
}
 }
 
-static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, uint64_t 
err_addr,
-   uint32_t ch_inst, uint32_t umc_inst)
+void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
+   struct ras_err_data *err_data, uint64_t 
err_addr,
+   uint32_t ch_inst, uint32_t umc_inst)
 {
uint32_t channel_index;
uint64_t soc_pa, retired_page, column;
@@ -553,5 +553,4 @@ struct amdgpu_umc_ras umc_v6_7_ras = {
.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
.ecc_info_query_ras_error_count = 
umc_v6_7_ecc_info_query_ras_error_count,
.ecc_info_query_ras_error_address = 
umc_v6_7_ecc_info_query_ras_error_address,
-   .convert_ras_error_address = umc_v6_7_convert_error_address,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
index fe41ed2f5945..105245d5b6e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
@@ -71,5 +71,7 @@ extern const uint32_t

umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
 extern const uint32_t

umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
-
+void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
+struct ras_

[PATCH] drm/amdgpu: move convert_error_address out of umc_ras

2022-10-14 Thread Hawking Zhang

RAS error address translation algorithm is common
across dGPU and A + A platform as along as the SOC
integrates the same generation of UMC IP.

UMC RAS is managed by x86 MCA on A + A platform,
umc_ras in GPU driver is not initialized at all on
A + A platform. In such case, any umc_ras callback
implemented for dGPU config shouldn't be invoked
from A + A specific callback.

The change moves convert_error_address out of dGPU
umc_ras structure and makes it share between A + A
and dGPU config.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  3 ---
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   |  7 +++
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.h   |  4 +++-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 75f1402101f4..ff92ea99d513 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,6 +36,7 @@
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
+#include "umc_v6_7.h"
 
 #ifdef CONFIG_X86_MCE_AMD
 #include 
@@ -2885,10 +2886,16 @@ static int amdgpu_bad_page_notifier(struct 
notifier_block *nb,
/*
 * Translate UMC channel address to Physical address
 */
-   if (adev->umc.ras &&
-   adev->umc.ras->convert_ras_error_address)
-   adev->umc.ras->convert_ras_error_address(adev,
-   _data, m->addr, ch_inst, umc_inst);
+   switch (adev->ip_versions[UMC_HWIP][0]) {
+   case IP_VERSION(6, 7, 0):
+   umc_v6_7_convert_error_address(adev,
+   _data, m->addr, ch_inst, umc_inst);
+   break;
+   default:
+   dev_warn(adev->dev,
+"UMC address to Physical address translation is not 
supported\n");
+   return NOTIFY_DONE;
+   }
 
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index e46439274f3a..3629d8f292ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -51,9 +51,6 @@ struct amdgpu_umc_ras {
struct amdgpu_ras_block_object ras_block;
void (*err_cnt_init)(struct amdgpu_device *adev);
bool (*query_ras_poison_mode)(struct amdgpu_device *adev);
-   void (*convert_ras_error_address)(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, uint64_t 
err_addr,
-   uint32_t ch_inst, uint32_t umc_inst);
void (*ecc_info_query_ras_error_count)(struct amdgpu_device *adev,
  void *ras_error_status);
void (*ecc_info_query_ras_error_address)(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
index 5d5d031c9e7d..72fd963f178b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c
@@ -187,9 +187,9 @@ static void umc_v6_7_ecc_info_query_ras_error_count(struct 
amdgpu_device *adev,
}
 }
 
-static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, uint64_t 
err_addr,
-   uint32_t ch_inst, uint32_t umc_inst)
+void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
+   struct ras_err_data *err_data, uint64_t 
err_addr,
+   uint32_t ch_inst, uint32_t umc_inst)
 {
uint32_t channel_index;
uint64_t soc_pa, retired_page, column;
@@ -553,5 +553,4 @@ struct amdgpu_umc_ras umc_v6_7_ras = {
.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
.ecc_info_query_ras_error_count = 
umc_v6_7_ecc_info_query_ras_error_count,
.ecc_info_query_ras_error_address = 
umc_v6_7_ecc_info_query_ras_error_address,
-   .convert_ras_error_address = umc_v6_7_convert_error_address,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
index fe41ed2f5945..105245d5b6e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h
@@ -71,5 +71,7 @@ extern const uint32_t

umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
 extern const uint32_t

umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM];
-
+void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
+struct ras_err_data *err_data, uint64_t 
err_addr,
+uint32_t ch_inst, uint32_t umc_inst);
 #endif
-- 
2.17.1

1 2 3 4 >

1 - 100 of 371 matches

Mail list logo