Quit first and try again later if gpu_reset is already running. This
way we can handle different jobs that hang on different rings and
crash into each other at the same time.

Change-Id: I0c6bc8d76959c5053e7523c41b2305032fc6b79a
Signed-off-by: Monk Liu <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 31a5608..9efbb33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2754,9 +2754,9 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, 
struct amdgpu_job *job)
        struct amdgpu_ring *ring;
        struct dma_fence *fence = NULL, *next = NULL;
 
-       /* other thread is already into the gpu reset so just quit */
+       /* another thread is already in the gpu reset, so just quit and come 
back later */
        if (!atomic_add_unless(&adev->in_sriov_reset, 1, 1))
-               return 0;
+               return -EAGAIN;
 
        atomic_inc(&adev->gpu_reset_counter);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 4510627..0db81a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -37,10 +37,19 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
                  atomic_read(&job->ring->fence_drv.last_seq),
                  job->ring->fence_drv.sync_seq);
 
-       if (amdgpu_sriov_vf(job->adev))
-               amdgpu_sriov_gpu_reset(job->adev, job);
-       else
+       if (amdgpu_sriov_vf(job->adev)) {
+               int r;
+
+try_again:
+               r = amdgpu_sriov_gpu_reset(job->adev, job);
+               if (r == -EAGAIN) {
+                       /* maybe two different schedulers both have a hung job, try 
later */
+                       schedule();
+                       goto try_again;
+               }
+       } else {
                amdgpu_gpu_reset(job->adev);
+       }
 }
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to