Re: [PATCH v3] drm/amd/amdgpu implement tdr advanced mode

2021-03-08 Thread Andrey Grodzovsky




On 2021-03-08 7:33 a.m., Jack Zhang wrote:

[Why]
Previous TDR design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and cause
an unexpected gfx job timeout, because the gfx and compute rings share
internal GC hardware.

[How]
This patch implements an advanced tdr mode.
1. It adds a synchronized resubmit-and-wait step to find the real bad job.
If a resubmitted job's hw fence times out, we decrease the old job's karma,
treat the newly found job as the guilty one, and do a hw reset to recover
the hw. After that, we continue the resubmit step to resubmit the remaining
jobs.

2. For a whole-GPU reset (VRAM lost), resubmit in the old style.
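
A rough sketch of this flow, condensed from the diff below (not itself part
of the patch): amdgpu_sched_decrease_karma() is the helper the patch adds in
amdgpu_job.c, and the other variables are the ones already in scope in
amdgpu_device_gpu_recover().

	/* Sketch only: per-ring handling once the first HW reset has
	 * completed, gated on amdgpu_gpu_recovery == 2 and on VRAM not
	 * having been lost since recovery started. */
	drm_sched_resubmit_jobs(&ring->sched);

	s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
					 struct drm_sched_job, node);
	if (s_job && dma_fence_wait_timeout(s_job->s_fence->parent, false,
					    ring->sched.timeout) == 0) {
		/* The oldest resubmitted job timed out again, so it (not the
		 * job that originally triggered the TDR) is the real
		 * offender: move the karma onto it. */
		if (job && &job->base != s_job)
			amdgpu_sched_decrease_karma(&job->base);
		drm_sched_increase_karma(s_job);

		/* Reset the HW once more (the SR-IOV path calls
		 * amdgpu_device_reset_sriov() instead) and resubmit the
		 * remaining jobs. */
		r = amdgpu_do_asic_reset(hive, device_list_handle,
					 &need_full_reset, false);
		atomic_inc(&tmp_adev->gpu_reset_counter); /* force VMID flush */
		drm_sched_resubmit_jobs(&ring->sched);
	}

This path is only taken when amdgpu_gpu_recovery == 2, so it presumably has
to be opted into through the existing amdgpu.gpu_recovery module parameter;
the amdgpu_drv.c hunk (not fully shown here) appears to extend the accepted
values accordingly.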

Signed-off-by: Jack Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 57 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 33 +
  3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..fa53c6c00ee9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-
+	int tmp_vram_lost_counter;
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
 	 */
@@ -4788,6 +4788,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		}
 	}
  
+	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* TODO Implement XGMI hive reset logic for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -4807,17 +4808,67 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
+			int ret = 0;
+			struct drm_sched_job *s_job = NULL;
 
 			if (!ring || !ring->sched.thread)
 				continue;
 
 			/* No point to resubmit jobs if we didn't HW reset*/
-			if (!tmp_adev->asic_reset_res && !job_signaled)
+			if (!tmp_adev->asic_reset_res && !job_signaled) {
 				drm_sched_resubmit_jobs(&ring->sched);
 
-			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+				if (amdgpu_gpu_recovery == 2 &&
+				    !list_empty(&ring->sched.ring_mirror_list) &&
+				    !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) {
+
+					s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list, struct drm_sched_job, node);


Seems better to check for NULL here and skip checking for list_empty above.
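
One way to read that suggestion (a minimal sketch, untested): drop the
list_empty() check from the outer condition and let the NULL return of
list_first_entry_or_null() cover the empty-list case:

	if (amdgpu_gpu_recovery == 2 &&
	    !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) {
		s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
						 struct drm_sched_job, node);
		if (s_job) {
			ret = dma_fence_wait_timeout(s_job->s_fence->parent,
						     false, ring->sched.timeout);
			/* ... timeout handling exactly as in the patch ... */
		}
	}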



+					ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
+					if (ret == 0) { /* timeout */
+						/*reset karma to the right job*/
+						if (job && job != s_job)
+							amdgpu_sched_decrease_karma(&job->base);
+						drm_sched_increase_karma(s_job);
+
+						/* do hw reset */
+						if (amdgpu_sriov_vf(adev)) {
+							amdgpu_virt_fini_data_exchange(adev);
+							r = amdgpu_device_reset_sriov(adev, false);
+							if (r)
+								adev->asic_reset_res = r;
+						} else {
+							r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
+							if (r && r == -EAGAIN)
+								goto retry;
+
+							/* add reset counter so that the following resubmitted job could flush vmid */
+							atomic_inc(&tmp_adev->gpu_reset_counter);
+
+							//resubmit again the left jobs
+							drm_sched_resubmit_jobs(&ring->sched);
+						}
+					}
+				}
+			}
+			if (amdgpu_gpu_recovery != 2)
+				drm_sched_start(&ring->sched,

[PATCH v3] drm/amd/amdgpu implement tdr advanced mode

2021-03-08 Thread Jack Zhang
[Why]
Previous TDR design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and cause
an unexpected gfx job timeout, because the gfx and compute rings share
internal GC hardware.

[How]
This patch implements an advanced tdr mode.
1. It adds a synchronized resubmit-and-wait step to find the real bad job.
If a resubmitted job's hw fence times out, we decrease the old job's karma,
treat the newly found job as the guilty one, and do a hw reset to recover
the hw. After that, we continue the resubmit step to resubmit the remaining
jobs.

2. For a whole-GPU reset (VRAM lost), resubmit in the old style.

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 57 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 33 +
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..fa53c6c00ee9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4639,7 +4639,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-
+	int tmp_vram_lost_counter;
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
 	 */
@@ -4788,6 +4788,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		}
 	}
 
+	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
 	/* Actual ASIC resets if needed.*/
 	/* TODO Implement XGMI hive reset logic for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
@@ -4807,17 +4808,67 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
+			int ret = 0;
+			struct drm_sched_job *s_job = NULL;
 
 			if (!ring || !ring->sched.thread)
 				continue;
 
 			/* No point to resubmit jobs if we didn't HW reset*/
-			if (!tmp_adev->asic_reset_res && !job_signaled)
+			if (!tmp_adev->asic_reset_res && !job_signaled) {
 				drm_sched_resubmit_jobs(&ring->sched);
 
-			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+				if (amdgpu_gpu_recovery == 2 &&
+				    !list_empty(&ring->sched.ring_mirror_list) &&
+				    !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) {
+
+					s_job = list_first_entry_or_null(&ring->sched.ring_mirror_list, struct drm_sched_job, node);
+					ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
+					if (ret == 0) { /* timeout */
+						/*reset karma to the right job*/
+						if (job && job != s_job)
+							amdgpu_sched_decrease_karma(&job->base);
+						drm_sched_increase_karma(s_job);
+
+						/* do hw reset */
+						if (amdgpu_sriov_vf(adev)) {
+							amdgpu_virt_fini_data_exchange(adev);
+							r = amdgpu_device_reset_sriov(adev, false);
+							if (r)
+								adev->asic_reset_res = r;
+						} else {
+							r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
+							if (r && r == -EAGAIN)
+								goto retry;
+
+							/* add reset counter so that the following resubmitted job could flush vmid */
+							atomic_inc(&tmp_adev->gpu_reset_counter);
+
+							//resubmit again the left jobs
+							drm_sched_resubmit_jobs(&ring->sched);
+						}
+					}
+				}
+			}
+			if (amdgpu_gpu_recovery != 2)
+				drm_sched_start(&ring->sched,