RE: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported
[AMD Official Use Only - General] The series is: Reviewed-by: Tao Zhou > -Original Message- > From: Hawking Zhang > Sent: Tuesday, January 2, 2024 10:16 PM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, > Stanley ; Wang, Yang(Kevin) > ; Chai, Thomas ; Li, > Candice > Cc: Zhang, Hawking ; Deucher, Alexander > ; Lazar, Lijo ; Ma, Le > > Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to > amdgpu_ras_check_supported > > Move ras capablity check to amdgpu_ras_check_supported. > Driver will query ras capablity through psp interace, or vbios interface, or > specific > ip callbacks. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +--- > 1 file changed, 93 insertions(+), 77 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index a901b00d4949..2ee82baaf7d6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -39,6 +39,7 @@ > #include "nbio_v7_9.h" > #include "atom.h" > #include "amdgpu_reset.h" > +#include "amdgpu_psp.h" > > #ifdef CONFIG_X86_MCE_AMD > #include > @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct > amdgpu_device *adev) > adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } > > +/* Query ras capablity via atomfirmware interface */ static void > +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) { > + /* mem_ecc cap */ > + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > + dev_info(adev->dev, "MEM ECC is active.\n"); > + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | > + 1 << AMDGPU_RAS_BLOCK__DF); > + } else { > + dev_info(adev->dev, "MEM ECC is not presented.\n"); > + } > + > + /* sram_ecc cap */ > + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { > + dev_info(adev->dev, "SRAM ECC is active.\n"); > + if (!amdgpu_sriov_vf(adev)) > + adev->ras_hw_enabled |= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > + 1 << > AMDGPU_RAS_BLOCK__DF); > + else > 
+ adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__PCIE_BIF | > + 1 << > AMDGPU_RAS_BLOCK__SDMA | > + 1 << > AMDGPU_RAS_BLOCK__GFX); > + > + /* > + * VCN/JPEG RAS can be supported on both bare metal and > + * SRIOV environment > + */ > + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, > 0) || > + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, > 0) || > + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, > 3)) > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + else > + adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + > + /* > + * XGMI RAS is not supported if xgmi num physical nodes > + * is zero > + */ > + if (!adev->gmc.xgmi.num_physical_nodes) > + adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__XGMI_WAFL); > + } else { > + dev_info(adev->dev, "SRAM ECC is not presented.\n"); > + } > +} > + > +/* Query poison mode from umc/df IP callbacks */ static void > +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + bool df_poison, umc_poison; > + > + /* poison setting is useless on SRIOV guest */ > + if (amdgpu_sriov_vf(adev) || !con) > + return; > + > + /* Init poison supported flag, the default value is false */ > + if (adev->gmc.xgmi.connected_to_cpu || > + adev->gmc.is_app_apu) { > + /* enabled by default when GPU is connected to CPU */ > + con->poison_supported = true; > + } else if (adev->df.funcs && > + adev->df.funcs->query_ras_poison_mode && > + adev->umc.ras && > +
[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported
Move ras capability check to amdgpu_ras_check_supported. Driver will query ras capability through psp interface, or vbios interface, or specific ip callbacks. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +--- 1 file changed, 93 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a901b00d4949..2ee82baaf7d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -39,6 +39,7 @@ #include "nbio_v7_9.h" #include "atom.h" #include "amdgpu_reset.h" +#include "amdgpu_psp.h" #ifdef CONFIG_X86_MCE_AMD #include @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } +/* Query ras capablity via atomfirmware interface */ +static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) +{ + /* mem_ecc cap */ + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { + dev_info(adev->dev, "MEM ECC is active.\n"); + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | +1 << AMDGPU_RAS_BLOCK__DF); + } else { + dev_info(adev->dev, "MEM ECC is not presented.\n"); + } + + /* sram_ecc cap */ + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { + dev_info(adev->dev, "SRAM ECC is active.\n"); + if (!amdgpu_sriov_vf(adev)) + adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + else + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | +1 << AMDGPU_RAS_BLOCK__SDMA | +1 << AMDGPU_RAS_BLOCK__GFX); + + /* +* VCN/JPEG RAS can be supported on both bare metal and +* SRIOV environment +*/ + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | +1 << AMDGPU_RAS_BLOCK__JPEG); + else + adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); + + /* +* XGMI RAS is not supported if xgmi num physical nodes +* is zero +*/ + if (!adev->gmc.xgmi.num_physical_nodes) + adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); + } else { + dev_info(adev->dev, "SRAM ECC is not presented.\n"); + } +} + +/* Query poison mode from umc/df IP callbacks */ +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + bool df_poison, umc_poison; + + /* poison setting is useless on SRIOV guest */ + if (amdgpu_sriov_vf(adev) || !con) + return; + + /* Init poison supported flag, the default value is false */ + if (adev->gmc.xgmi.connected_to_cpu || + adev->gmc.is_app_apu) { + /* enabled by default when GPU is connected to CPU */ + con->poison_supported = true; + } else if (adev->df.funcs && + adev->df.funcs->query_ras_poison_mode && + adev->umc.ras && + adev->umc.ras->query_ras_poison_mode) { + df_poison = + adev->df.funcs->query_ras_poison_mode(adev); + umc_poison = + adev->umc.ras->query_ras_poison_mode(adev); + + /* Only poison is set in both DF and UMC, we can support it */ + if (df_poison && umc_poison) + con->poison_supported = true; + else if (df_poison != umc_poison) + dev_warn(adev->dev, + "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", + df_poison, umc_poison); + } +} + /* * check hardware's ras ability which will be saved in hw_supported. * if hardware does not support ras, we can skip some ras initializtion and @@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (!amdgpu_ras_asic_supported(adev)) return; - if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { - if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { -
RE: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported
[AMD Official Use Only - General] The series is: Reviewed-by: Tao Zhou > -Original Message- > From: amd-gfx On Behalf Of Hawking > Zhang > Sent: Tuesday, January 2, 2024 11:45 AM > To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, > Stanley ; Wang, Yang(Kevin) > ; Chai, Thomas ; Li, > Candice > Cc: Deucher, Alexander ; Ma, Le > ; Lazar, Lijo ; Zhang, Hawking > > Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to > amdgpu_ras_check_supported > > Move ras capablity check to amdgpu_ras_check_supported. > Driver will query ras capablity through psp interace, or vbios interface, or > specific > ip callbacks. > > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +--- > 1 file changed, 93 insertions(+), 77 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 5f302b7693b3..72b6e41329b0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -39,6 +39,7 @@ > #include "nbio_v7_9.h" > #include "atom.h" > #include "amdgpu_reset.h" > +#include "amdgpu_psp.h" > > #ifdef CONFIG_X86_MCE_AMD > #include > @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct > amdgpu_device *adev) > adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } > > +/* Query ras capablity via atomfirmware interface */ static void > +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) { > + /* mem_ecc cap */ > + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > + dev_info(adev->dev, "MEM ECC is active.\n"); > + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | > + 1 << AMDGPU_RAS_BLOCK__DF); > + } else { > + dev_info(adev->dev, "MEM ECC is not presented.\n"); > + } > + > + /* sram_ecc cap */ > + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { > + dev_info(adev->dev, "SRAM ECC is active.\n"); > + if (!amdgpu_sriov_vf(adev)) > + adev->ras_hw_enabled |= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > + 1 << > 
AMDGPU_RAS_BLOCK__DF); > + else > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__PCIE_BIF | > + 1 << > AMDGPU_RAS_BLOCK__SDMA | > + 1 << > AMDGPU_RAS_BLOCK__GFX); > + > + /* > + * VCN/JPEG RAS can be supported on both bare metal and > + * SRIOV environment > + */ > + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, > 0) || > + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, > 0) || > + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, > 3)) > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + else > + adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + > + /* > + * XGMI RAS is not supported if xgmi num physical nodes > + * is zero > + */ > + if (!adev->gmc.xgmi.num_physical_nodes) > + adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__XGMI_WAFL); > + } else { > + dev_info(adev->dev, "SRAM ECC is not presented.\n"); > + } > +} > + > +/* Query poison mode from umc/df IP callbacks */ static void > +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + bool df_poison, umc_poison; > + > + /* poison setting is useless on SRIOV guest */ > + if (amdgpu_sriov_vf(adev) || !con) > + return; > + > + /* Init poison supported flag, the default value is false */ > + if (adev->gmc.xgmi.connected_to_cpu || > + adev->gmc.is_app_apu) { > + /* enabled by default when GPU is connected to CPU */ > + con->poison_supported = true; > + } else if (adev->df.funcs && > + adev->df.funcs->query_ras_poison_mode && > + adev->umc.ras &
[PATCH 3/3] drm/amdgpu: Centralize ras cap query to amdgpu_ras_check_supported
Move ras capability check to amdgpu_ras_check_supported. Driver will query ras capability through psp interface, or vbios interface, or specific ip callbacks. Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +--- 1 file changed, 93 insertions(+), 77 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5f302b7693b3..72b6e41329b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -39,6 +39,7 @@ #include "nbio_v7_9.h" #include "atom.h" #include "amdgpu_reset.h" +#include "amdgpu_psp.h" #ifdef CONFIG_X86_MCE_AMD #include @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } +/* Query ras capablity via atomfirmware interface */ +static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) +{ + /* mem_ecc cap */ + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { + dev_info(adev->dev, "MEM ECC is active.\n"); + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | +1 << AMDGPU_RAS_BLOCK__DF); + } else { + dev_info(adev->dev, "MEM ECC is not presented.\n"); + } + + /* sram_ecc cap */ + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { + dev_info(adev->dev, "SRAM ECC is active.\n"); + if (!amdgpu_sriov_vf(adev)) + adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + else + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | +1 << AMDGPU_RAS_BLOCK__SDMA | +1 << AMDGPU_RAS_BLOCK__GFX); + + /* +* VCN/JPEG RAS can be supported on both bare metal and +* SRIOV environment +*/ + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | +1 << AMDGPU_RAS_BLOCK__JPEG); + else + adev->ras_hw_enabled &= ~(1 << 
AMDGPU_RAS_BLOCK__VCN | + 1 << AMDGPU_RAS_BLOCK__JPEG); + + /* +* XGMI RAS is not supported if xgmi num physical nodes +* is zero +*/ + if (!adev->gmc.xgmi.num_physical_nodes) + adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); + } else { + dev_info(adev->dev, "SRAM ECC is not presented.\n"); + } +} + +/* Query poison mode from umc/df IP callbacks */ +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + bool df_poison, umc_poison; + + /* poison setting is useless on SRIOV guest */ + if (amdgpu_sriov_vf(adev) || !con) + return; + + /* Init poison supported flag, the default value is false */ + if (adev->gmc.xgmi.connected_to_cpu || + adev->gmc.is_app_apu) { + /* enabled by default when GPU is connected to CPU */ + con->poison_supported = true; + } else if (adev->df.funcs && + adev->df.funcs->query_ras_poison_mode && + adev->umc.ras && + adev->umc.ras->query_ras_poison_mode) { + df_poison = + adev->df.funcs->query_ras_poison_mode(adev); + umc_poison = + adev->umc.ras->query_ras_poison_mode(adev); + + /* Only poison is set in both DF and UMC, we can support it */ + if (df_poison && umc_poison) + con->poison_supported = true; + else if (df_poison != umc_poison) + dev_warn(adev->dev, + "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", + df_poison, umc_poison); + } +} + /* * check hardware's ras ability which will be saved in hw_supported. * if hardware does not support ras, we can skip some ras initializtion and @@ -2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (!amdgpu_ras_asic_supported(adev)) return; - if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { - if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { -