Re: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

2024-04-17 Thread Deucher, Alexander
[Public]

Acked-by: Alex Deucher 

From: amd-gfx on behalf of Hawking Zhang
Sent: Tuesday, April 16, 2024 1:56 AM
To: amd-gfx@lists.freedesktop.org ; Zhou1, Tao 

Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed. mmhub requires
mode-1 reset.

Signed-off-by: Hawking Zhang 
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..94eb2493103ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 uint16_t pasid, uint16_t client_id)
 {
 enum amdgpu_ras_block block = 0;
-   int old_poison, ret = -EINVAL;
+   int old_poison;
 uint32_t reset = 0;
 struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
 case SOC15_IH_CLIENTID_SE2SH:
 case SOC15_IH_CLIENTID_SE3SH:
 case SOC15_IH_CLIENTID_UTCL2:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
 block = AMDGPU_RAS_BLOCK__GFX;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
 break;
 case SOC15_IH_CLIENTID_VMC:
 case SOC15_IH_CLIENTID_VMC1:
-   ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
 block = AMDGPU_RAS_BLOCK__MMHUB;
-   if (ret)
-   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+   reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
 break;
 case SOC15_IH_CLIENTID_SDMA0:
 case SOC15_IH_CLIENTID_SDMA1:
@@ -189,18 +185,6 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,

 kfd_signal_poison_consumed_event(dev, pasid);

-   /* resetting queue passes, do page retirement without gpu reset
-* resetting queue fails, fallback to gpu reset solution
-*/
-   if (!ret)
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
-   client_id);
-   else
-   dev_warn(dev->adev->dev,
-   "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
-   client_id);
-
 amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }

--
2.17.1



RE: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

2024-04-15 Thread Zhang, Hawking
[AMD Official Use Only - General]

Please ignore this one, will send out a new one

-----Original Message-----
From: Zhou1, Tao 
Sent: Tuesday, April 16, 2024 01:08
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: RE: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

[AMD Official Use Only - General]

Reviewed-by: Tao Zhou 

> -----Original Message-----
> From: Hawking Zhang 
> Sent: Tuesday, April 16, 2024 12:34 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao 
> Cc: Zhang, Hawking 
> Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison
> handling
>
> mode-2 reset is the only reliable method that can get GC/SDMA back
> when poison is consumed. mmhub requires
> mode-1 reset.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 8 ++------
>  1 file changed, 2 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index c368c70df3f4a..b6caf6eda8a0c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -163,17 +163,13 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>   case SOC15_IH_CLIENTID_SE2SH:
>   case SOC15_IH_CLIENTID_SE3SH:
>   case SOC15_IH_CLIENTID_UTCL2:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>   block = AMDGPU_RAS_BLOCK__GFX;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>   break;
>   case SOC15_IH_CLIENTID_VMC:
>   case SOC15_IH_CLIENTID_VMC1:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>   block = AMDGPU_RAS_BLOCK__MMHUB;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
>   break;
>   case SOC15_IH_CLIENTID_SDMA0:
>   case SOC15_IH_CLIENTID_SDMA1:
> --
> 2.17.1




RE: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

2024-04-15 Thread Zhou1, Tao
[AMD Official Use Only - General]

Reviewed-by: Tao Zhou 

> -----Original Message-----
> From: Hawking Zhang 
> Sent: Tuesday, April 16, 2024 12:34 PM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao 
> Cc: Zhang, Hawking 
> Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling
>
> mode-2 reset is the only reliable method that can get GC/SDMA back when
> poison is consumed. mmhub requires
> mode-1 reset.
>
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 8 ++------
>  1 file changed, 2 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index c368c70df3f4a..b6caf6eda8a0c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -163,17 +163,13 @@ static void
> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>   case SOC15_IH_CLIENTID_SE2SH:
>   case SOC15_IH_CLIENTID_SE3SH:
>   case SOC15_IH_CLIENTID_UTCL2:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>   block = AMDGPU_RAS_BLOCK__GFX;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>   break;
>   case SOC15_IH_CLIENTID_VMC:
>   case SOC15_IH_CLIENTID_VMC1:
> - ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
>   block = AMDGPU_RAS_BLOCK__MMHUB;
> - if (ret)
> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
>   break;
>   case SOC15_IH_CLIENTID_SDMA0:
>   case SOC15_IH_CLIENTID_SDMA1:
> --
> 2.17.1