[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Harish Kasiviswanathan <[email protected]>



________________________________
From: Yang, Philip <[email protected]>
Sent: Tuesday, December 9, 2025 10:37 AM
To: [email protected] <[email protected]>
Cc: Kasiviswanathan, Harish <[email protected]>; Lazar, Lijo 
<[email protected]>; Yang, Philip <[email protected]>
Subject: [PATCH] drm/amdkfd: Handle GPU reset and drain retry fault race

Only check and drain the IH1 ring if the retry CAM is not enabled.

If the GPU is under reset (the reset domain semaphore cannot be taken for
read), skip that GPU instead of accessing the IH rings to drain retry faults.

Signed-off-by: Philip Yang <[email protected]>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 377dd75f026b..89c5163b867d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -33,6 +33,7 @@
 #include "amdgpu_hmm.h"
 #include "amdgpu.h"
 #include "amdgpu_xgmi.h"
+#include "amdgpu_reset.h"
 #include "kfd_priv.h"
 #include "kfd_svm.h"
 #include "kfd_migrate.h"
@@ -2367,6 +2368,9 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)

                 pr_debug("drain retry fault gpu %d svms %p\n", i, svms);

+               if (!down_read_trylock(&pdd->dev->adev->reset_domain->sem))
+                       continue;
+
                 amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
                                 pdd->dev->adev->irq.retry_cam_enabled ?
                                 &pdd->dev->adev->irq.ih :
@@ -2376,6 +2380,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
                         amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
                                 &pdd->dev->adev->irq.ih_soft);

+               up_read(&pdd->dev->adev->reset_domain->sem);

                 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
         }
@@ -2559,7 +2564,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
                 adev = pdd->dev->adev;

                 /* Check and drain ih1 ring if cam not available */
-               if (adev->irq.ih1.ring_size) {
+               if (!adev->irq.retry_cam_enabled && adev->irq.ih1.ring_size) {
                         ih = &adev->irq.ih1;
                         checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
                         if (ih->rptr != checkpoint_wptr) {
--
2.50.1

Reply via email to