Re: [PATCH] Revert freesync video patches temporarily

2021-03-06 Thread Jacob, Anson
[AMD Public Use]

Reviewed-by: Anson Jacob 


-- Anson

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] Revert freesync video patches temporarily

2021-03-06 Thread Aurabindo Pillai
This temporarily reverts the freesync video patches since they cause a regression
with eDP displays. This patch is a squashed revert of the following patches:

a372f4abecb1 drm/amd/display: Skip modeset for front porch change
952bc47fb2ca drm/amd/display: Add freesync video modes based on preferred modes
d03ee581eee6 drm/amd/display: Add module parameter for freesync video mode
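
For reference, the reverted optimization skips a full modeset when only the vertical
blanking of the requested mode differs from the current one. A rough, hypothetical
sketch of that kind of check (not the actual reverted code; the exact field
comparisons are illustrative only):

#include <drm/drm_modes.h>

/* Hypothetical helper: true when the active area, horizontal timing and pixel
 * clock are unchanged and only the vertical total differs, i.e. the new mode
 * could be applied by adjusting the front porch without a full modeset. */
static bool only_front_porch_changed(const struct drm_display_mode *old_mode,
				     const struct drm_display_mode *new_mode)
{
	return old_mode->clock       == new_mode->clock &&
	       old_mode->hdisplay    == new_mode->hdisplay &&
	       old_mode->htotal      == new_mode->htotal &&
	       old_mode->hsync_start == new_mode->hsync_start &&
	       old_mode->hsync_end   == new_mode->hsync_end &&
	       old_mode->vdisplay    == new_mode->vdisplay &&
	       old_mode->vtotal      != new_mode->vtotal;
}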

Signed-off-by: Aurabindo Pillai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  12 -
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 369 ++
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |   2 -
 4 files changed, 35 insertions(+), 349 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 67279a40ab2c..a6b9b6aa5cd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -185,7 +185,6 @@ extern int amdgpu_emu_mode;
 extern uint amdgpu_smu_memory_pool_size;
 extern int amdgpu_smu_pptable_id;
 extern uint amdgpu_dc_feature_mask;
-extern uint amdgpu_freesync_vid_mode;
 extern uint amdgpu_dc_debug_mask;
 extern uint amdgpu_dm_abm_level;
 extern int amdgpu_backlight;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 865f924772b0..1eca88c23acb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -165,7 +165,6 @@ int amdgpu_mes;
 int amdgpu_noretry = -1;
 int amdgpu_force_asic_type = -1;
 int amdgpu_tmz = -1; /* auto */
-uint amdgpu_freesync_vid_mode;
 int amdgpu_reset_method = -1; /* auto */
 int amdgpu_num_kcq = -1;
 
@@ -823,17 +822,6 @@ module_param_named(backlight, amdgpu_backlight, bint, 0444);
 MODULE_PARM_DESC(tmz, "Enable TMZ feature (-1 = auto (default), 0 = off, 1 = on)");
 module_param_named(tmz, amdgpu_tmz, int, 0444);
 
-/**
- * DOC: freesync_video (uint)
- * Enabled the optimization to adjust front porch timing to achieve seamless mode change experience
- * when setting a freesync supported mode for which full modeset is not needed.
- * The default value: 0 (off).
- */
-MODULE_PARM_DESC(
-	freesync_video,
-	"Enable freesync modesetting optimization feature (0 = off (default), 1 = on)");
-module_param_named(freesync_video, amdgpu_freesync_vid_mode, uint, 0444);
-
-
 /**
  * DOC: reset_method (int)
  * GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 
= mode2, 4 = baco, 5 = pci)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index d559d01b9c27..0085a5e5f342 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -213,9 +213,6 @@ static bool amdgpu_dm_psr_disable_all(struct 
amdgpu_display_manager *dm);
 static const struct drm_format_info *
 amd_get_format_info(const struct drm_mode_fb_cmd2 *cmd);
 
-static bool
-is_timing_unchanged_for_freesync(struct drm_crtc_state *old_crtc_state,
-struct drm_crtc_state *new_crtc_state);
 /*
  * dm_vblank_get_counter
  *
@@ -339,17 +336,6 @@ static inline bool amdgpu_dm_vrr_active(struct 
dm_crtc_state *dm_state)
   dm_state->freesync_config.state == VRR_STATE_ACTIVE_FIXED;
 }
 
-static inline bool is_dc_timing_adjust_needed(struct dm_crtc_state *old_state,
- struct dm_crtc_state *new_state)
-{
-   if (new_state->freesync_config.state ==  VRR_STATE_ACTIVE_FIXED)
-   return true;
-   else if (amdgpu_dm_vrr_active(old_state) != 
amdgpu_dm_vrr_active(new_state))
-   return true;
-   else
-   return false;
-}
-
 /**
  * dm_pflip_high_irq() - Handle pageflip interrupt
  * @interrupt_params: ignored
@@ -5158,16 +5144,19 @@ static void 
fill_stream_properties_from_drm_display_mode(
timing_out->hdmi_vic = hv_frame.vic;
}
 
-   timing_out->h_addressable = mode_in->hdisplay;
-   timing_out->h_total = mode_in->htotal;
-   timing_out->h_sync_width = mode_in->hsync_end - mode_in->hsync_start;
-   timing_out->h_front_porch = mode_in->hsync_start - mode_in->hdisplay;
-   timing_out->v_total = mode_in->vtotal;
-   timing_out->v_addressable = mode_in->vdisplay;
-   timing_out->v_front_porch = mode_in->vsync_start - mode_in->vdisplay;
-   timing_out->v_sync_width = mode_in->vsync_end - mode_in->vsync_start;
-   timing_out->pix_clk_100hz = mode_in->clock * 10;
-
+   timing_out->h_addressable = mode_in->crtc_hdisplay;
+   timing_out->h_total = mode_in->crtc_htotal;
+   timing_out->h_sync_width =
+   mode_in->crtc_hsync_end - mode_in->crtc_hsync_start;
+   timing_out->h_front_porch =
+   mode_in->crtc_hsync_start - mode_in->crtc_hdisplay;
+   timing_out->v_total = mode_in->crtc_vtotal;
+   timing_out->v_addressable = mode_in->crtc_vdisplay;

[PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

2021-03-06 Thread shaoyunl
In passthrough configuration, the hypervisor will trigger SBR (Secondary Bus Reset)
on the devices without synchronizing with each other. This can cause a device hang
since, in an XGMI configuration, all the devices within the hive need to be reset
within a limited time slot. This series of patches tries to solve the issue by
cooperating with a new SMU that only does minimal housekeeping in response to the
SBR request, does not perform the real reset, and leaves that to the driver. The
driver needs to do the full SW init and a minimal HW init to bring up the SMU and
then trigger the reset (possibly BACO) on all the ASICs at the same time.
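
For illustration, a very rough sketch of the delayed reset path described above
(simplified, hypothetical code; it reuses the reset_list node introduced in the
previous patch of this series, and the real handler also performs the minimal
SW/HW init and triggers the resets in parallel rather than in a simple loop):

/* Runs from a delayed work item scheduled at probe time. By then every GPU in
 * the system is expected to be registered in mgpu_info. */
static void delayed_hive_reset_sketch(struct work_struct *work)
{
	struct list_head device_list;
	struct amdgpu_device *adev;
	int i, r;

	INIT_LIST_HEAD(&device_list);

	/* collect all devices that still owe a reset, hive membership unknown */
	for (i = 0; i < mgpu_info.num_gpu; i++) {
		adev = mgpu_info.gpu_ins[i].adev;
		if (adev->gmc.xgmi.pending_reset)
			list_add_tail(&adev->reset_list, &device_list);
	}

	/* reset the collected ASICs; hive/node info is rebuilt afterwards
	 * via amdgpu_xgmi_add_device() */
	list_for_each_entry(adev, &device_list, reset_list) {
		r = amdgpu_asic_reset(adev);
		if (r)
			dev_err(adev->dev, "asic reset on init failed\n");
	}
}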

Signed-off-by: shaoyunl 
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  13 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  69 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
 5 files changed, 163 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d46d3794699e..5602c6edee97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
uint32_tnum_gpu;
uint32_tnum_dgpu;
uint32_tnum_apu;
+
+   /* delayed reset_func for XGMI configuration if necessary */
+   struct delayed_work delayed_reset_work;
+   boolpending_reset;
 };
 
 #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH256
@@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device 
*adev,
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
 
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+ struct amdgpu_job *job,
+ bool *need_full_reset_arg);
+
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+ struct list_head *device_list_handle,
+ bool *need_full_reset_arg,
+ bool skip_hw_reset);
+
 int emu_soc_asic_init(struct amdgpu_device *adev);
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c35b0c1e710..5b520f70e660 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
}
 
+   /* Don't post if we need to reset whole hive on init */
+   if (adev->gmc.xgmi.pending_reset)
+   return false;
+
if (adev->has_hw_reset) {
adev->has_hw_reset = false;
return true;
@@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device 
*adev)
if (adev->ip_blocks[i].version->type != 
AMD_IP_BLOCK_TYPE_PSP)
continue;
 
+   if (!adev->ip_blocks[i].status.sw)
+   continue;
+
/* no need to do the fw loading again if already done*/
if (adev->ip_blocks[i].status.hw == true)
break;
@@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
 
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
-   amdgpu_amdkfd_device_init(adev);
+
+   /* Don't init kfd if whole hive need to be reset during init */
+   if (!adev->gmc.xgmi.pending_reset)
+   amdgpu_amdkfd_device_init(adev);
 
amdgpu_fru_get_product_info(adev);
 
@@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct 
amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
continue;
}
+
+		/* skip unnecessary suspend if we do not initialize them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
@@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 *  E.g., driver was not cleanly unloaded 

[PATCH 4/5] drm/amdgpu: Add reset_list for device list used for reset

2021-03-06 Thread shaoyunl
The gmc.xgmi.head list was originally designed to hold the device list of an XGMI
hive. Reusing it for reset purposes prevents the reset function from adjusting the
XGMI device list, which is required in the next change.
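
Conceptually, each device simply gets a second list node dedicated to reset
bookkeeping, so building a reset list never has to touch the hive's own list.
A minimal sketch of the new pattern (hypothetical helper; the real changes are
in the diff below):

/* Collect devices for a reset through their dedicated reset_list node, leaving
 * gmc.xgmi.head (the hive membership) untouched so the hive list can still be
 * adjusted while a reset is in flight. */
static void build_reset_list(struct amdgpu_hive_info *hive,
			     struct list_head *reset_device_list)
{
	struct amdgpu_device *tmp_adev;

	INIT_LIST_HEAD(reset_device_list);
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
		list_add_tail(&tmp_adev->reset_list, reset_device_list);
}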

Signed-off-by: shaoyunl 
Change-Id: Ibbdf75c02836151adf5bb44186e6ced97dbf8c1d
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 --
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f01b75ec6c60..d46d3794699e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1040,6 +1040,7 @@ struct amdgpu_device {
 
int asic_reset_res;
struct work_struct  xgmi_reset_work;
+   struct list_headreset_list;
 
longgfx_timeout;
longsdma_timeout;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62d7ce621457..3c35b0c1e710 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3290,6 +3290,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	INIT_LIST_HEAD(&adev->shadow_list);
 	mutex_init(&adev->shadow_list_lock);
 
+	INIT_LIST_HEAD(&adev->reset_list);
+
 	INIT_DELAYED_WORK(&adev->delayed_init_work,
 			  amdgpu_device_delayed_init_work_handler);
 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -4301,11 +4303,11 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
int r = 0;
 
/*
-* ASIC reset has to be done on all HGMI hive nodes ASAP
+* ASIC reset has to be done on all XGMI hive nodes ASAP
 * to allow proper links negotiation in FW (within 1 sec)
 */
if (!skip_hw_reset && need_full_reset) {
-		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			/* For XGMI run all resets in parallel to speed up the process */
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
@@ -4322,8 +4324,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
 
/* For XGMI wait for all resets to complete before proceed */
if (!r) {
-		list_for_each_entry(tmp_adev, device_list_handle,
-				    gmc.xgmi.head) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
 				flush_work(&tmp_adev->xgmi_reset_work);
 				r = tmp_adev->asic_reset_res;
@@ -4335,7 +4336,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
}
 
if (!r && amdgpu_ras_intr_triggered()) {
-		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 			if (tmp_adev->mmhub.funcs &&
 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
@@ -4344,7 +4345,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
amdgpu_ras_intr_cleared();
}
 
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
if (amdgpu_device_asic_init(tmp_adev))
@@ -4655,16 +4656,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
*adev,
 */
 	INIT_LIST_HEAD(&device_list);
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
-			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
-		device_list_handle = &hive->device_list;
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+			list_add_tail(&tmp_adev->reset_list, &device_list);
+		if (!list_is_first(&adev->reset_list, &device_list))
+			list_rotate_to_front(&adev->reset_list, &device_list);
+		device_list_handle = &device_list;
 	} else {
-		list_add_tail(&adev->gmc.xgmi.head, &device_list);
+		list_add_tail(&adev->reset_list, &device_list);
 		device_list_handle = &device_list;
 	}
 
/* block all schedulers and reset given job's ring */
-   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+   list_for_each_entry(tmp_adev, 

RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

2021-03-06 Thread Liu, Shaoyun
[AMD Official Use Only - Internal Distribution Only]

It seems I cannot directly reuse the reset HW function inside
amdgpu_do_asic_reset; the synchronization is based on hive->tb, but as explained,
we actually don't know which hive a GPU belongs to and will rebuild the correct
hive info inside amdgpu_do_asic_reset with amdgpu_xgmi_add_device. So I need to
remove all GPUs from the hive first. That breaks the synchronization, since
hive->tb is removed as well once all GPUs are removed.
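
For context, a rough sketch of the synchronization being lost, assuming the BACO
path of the per-device XGMI reset work looks roughly as follows (simplified):

/* The per-device reset work steps through BACO in lock-step with the other
 * hive members via hive->tb. With every GPU removed from the hive, the barrier
 * has no participants left, so this sequencing no longer works. */
static void xgmi_baco_reset_sketch(struct amdgpu_device *adev,
				   struct amdgpu_hive_info *hive)
{
	task_barrier_enter(&hive->tb);	/* wait until all nodes arrive */
	adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
	if (adev->asic_reset_res)
		return;
	task_barrier_exit(&hive->tb);	/* all nodes in BACO, now leave it */
	adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
}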

Thanks 
Shaoyun.liu

-Original Message-
From: amd-gfx  On Behalf Of Liu, Shaoyun
Sent: Saturday, March 6, 2021 3:41 PM
To: Grodzovsky, Andrey ; 
amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

[AMD Official Use Only - Internal Distribution Only]

I call amdgpu_do_asic_reset with the parameter skip_hw_reset = true, so the
reset won't be executed twice. But probably I can set this parameter to true
and remove the code that schedules the reset, since I now build the device_list
not based on the hive. Let me try that.
For the delayed work thread scheduled with AMDGPU_RESUME_MS, it's actually not
waiting for the SMU to start. As I explained, I need to reset all the GPUs in
the system since I don't know which GPUs belong to which hive. This delay allows
the system to probe all the GPUs in the system, which means that when the
delayed thread starts, we can assume all the devices have already been populated
in mgpu_info.

Regards
Shaoyun.liu

-Original Message-
From: Grodzovsky, Andrey 
Sent: Saturday, March 6, 2021 1:09 AM
To: Liu, Shaoyun ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

Thanks for explaining this. One thing I still don't understand is why you
schedule the reset work explicitly at the beginning of
amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset,
which will do the same thing. It looks like the physical reset will execute
twice for each device.
Another thing, more of an improvement suggestion: currently you schedule
delayed_reset_work using AMDGPU_RESUME_MS, so I guess this should give enough
time for the SMU to start? Is there maybe a way to instead poll for SMU start
completion and then execute this, e.g. via some SMU status registers? Just to
avoid relying on this arbitrary value.
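
Something along those lines could replace the fixed delay, assuming a suitable
SMU readiness indicator exists (the helper below is purely hypothetical):

/* Hypothetical: poll an SMU "ready" indication instead of waiting a fixed
 * AMDGPU_RESUME_MS before running the hive reset work. */
static int wait_for_smu_ready_sketch(struct amdgpu_device *adev,
				     unsigned int timeout_ms)
{
	unsigned int waited_ms = 0;

	while (waited_ms < timeout_ms) {
		if (smu_is_fw_ready(adev))	/* hypothetical status check */
			return 0;
		msleep(10);
		waited_ms += 10;
	}
	return -ETIMEDOUT;
}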

Andrey

On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Hi Andrey,
> The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset)
> assume the driver already has the correct hive info. But in my case that's
> not true. The GPUs are in a bad state and the XGMI TA might not be functioning
> properly, so the driver cannot get the hive and node info when probing the
> device. It means the driver doesn't even know which hive a device belongs to
> on a system with a multi-hive configuration (e.g., 8 GPUs in two hives). The
> only solution I can think of is to let the driver trigger the reset on all GPUs
> at the same time, after the driver does the minimum initialization on the HW
> (bringing up the SMU IP), no matter whether they belong to the same hive or
> not, and to call amdgpu_xgmi_add_device for each device after re-init.
> The 100 ms delay was added after the BACO reset. I think it can be removed;
> let me verify it.
> 
> Regards
> Shaoyun.liu
> 
> 
> 
> -Original Message-
> From: Grodzovsky, Andrey 
> Sent: Friday, March 5, 2021 2:27 PM
> To: Liu, Shaoyun ; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
> hive during probe
> 
> 
> 
> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>> In passthrough configuration, the hypervisor will trigger SBR (Secondary
>> Bus Reset) on the devices without synchronizing with each other.
>> This can cause a device hang since, in an XGMI configuration, all the
>> devices within the hive need to be reset within a limited time slot. This
>> series of patches tries to solve the issue by cooperating with a new SMU
>> which only does minimal housekeeping to respond to the SBR request but
>> doesn't do the real reset job and leaves it to the driver. The driver needs
>> to do the full SW init and minimal HW init to bring up the SMU and
>> trigger the reset (possibly BACO) on all the ASICs at the same time
>>
>> Signed-off-by: shaoyunl 
>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu.h|  13 +++
>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++--
>>drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  71 ++
>>drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|   1 +
>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>5 files changed, 165 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 

RE: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive during probe

2021-03-06 Thread Liu, Shaoyun
[AMD Official Use Only - Internal Distribution Only]

I call amdgpu_do_asic_reset with the parameter skip_hw_reset = true, so the
reset won't be executed twice. But probably I can set this parameter to true
and remove the code that schedules the reset, since I now build the device_list
not based on the hive. Let me try that.
For the delayed work thread scheduled with AMDGPU_RESUME_MS, it's actually not
waiting for the SMU to start. As I explained, I need to reset all the GPUs in
the system since I don't know which GPUs belong to which hive. This delay allows
the system to probe all the GPUs in the system, which means that when the
delayed thread starts, we can assume all the devices have already been populated
in mgpu_info.
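
In other words, the delay only exists so that every device gets probed before the
handler runs; schematically (simplified, not the patch's exact code):

/* Schedule one system-wide delayed reset from probe; by the time it fires,
 * all GPUs are expected to have registered themselves in mgpu_info. */
if (adev->gmc.xgmi.pending_reset)
	schedule_delayed_work(&mgpu_info.delayed_reset_work,
			      msecs_to_jiffies(AMDGPU_RESUME_MS));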

Regards
Shaoyun.liu

-Original Message-
From: Grodzovsky, Andrey  
Sent: Saturday, March 6, 2021 1:09 AM
To: Liu, Shaoyun ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng 
probe

Thanks for explaining this. One thing I still don't understand is why you
schedule the reset work explicitly at the beginning of
amdgpu_drv_delayed_reset_work_handler and then also call amdgpu_do_asic_reset,
which will do the same thing. It looks like the physical reset will execute
twice for each device.
Another thing, more of an improvement suggestion: currently you schedule
delayed_reset_work using AMDGPU_RESUME_MS, so I guess this should give enough
time for the SMU to start? Is there maybe a way to instead poll for SMU start
completion and then execute this, e.g. via some SMU status registers? Just to
avoid relying on this arbitrary value.

Andrey

On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> Hi Andrey,
> The existing reset functions (amdgpu_device_gpu_recover or amdgpu_do_asic_reset)
> assume the driver already has the correct hive info. But in my case that's
> not true. The GPUs are in a bad state and the XGMI TA might not be functioning
> properly, so the driver cannot get the hive and node info when probing the
> device. It means the driver doesn't even know which hive a device belongs to
> on a system with a multi-hive configuration (e.g., 8 GPUs in two hives). The
> only solution I can think of is to let the driver trigger the reset on all GPUs
> at the same time, after the driver does the minimum initialization on the HW
> (bringing up the SMU IP), no matter whether they belong to the same hive or
> not, and to call amdgpu_xgmi_add_device for each device after re-init.
> The 100 ms delay was added after the BACO reset. I think it can be removed;
> let me verify it.
> 
> Regards
> Shaoyun.liu
> 
> 
> 
> -Original Message-
> From: Grodzovsky, Andrey 
> Sent: Friday, March 5, 2021 2:27 PM
> To: Liu, Shaoyun ; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI 
> hive during probe
> 
> 
> 
> On 2021-03-05 12:52 p.m., shaoyunl wrote:
>> In passthrough configuration, the hypervisor will trigger SBR (Secondary
>> Bus Reset) on the devices without synchronizing with each other.
>> This can cause a device hang since, in an XGMI configuration, all the
>> devices within the hive need to be reset within a limited time slot. This
>> series of patches tries to solve the issue by cooperating with a new SMU
>> which only does minimal housekeeping to respond to the SBR request but
>> doesn't do the real reset job and leaves it to the driver. The driver needs
>> to do the full SW init and minimal HW init to bring up the SMU and
>> trigger the reset (possibly BACO) on all the ASICs at the same time
>>
>> Signed-off-by: shaoyunl 
>> Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu.h|  13 +++
>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++--
>>drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  71 ++
>>drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h|   1 +
>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
>>5 files changed, 165 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index d46d3794699e..5602c6edee97 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
>>  uint32_tnum_gpu;
>>  uint32_tnum_dgpu;
>>  uint32_tnum_apu;
>> +
>> +/* delayed reset_func for XGMI configuration if necessary */
>> +struct delayed_work delayed_reset_work;
>> +boolpending_reset;
>>};
>>
>>#define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH   256
>> @@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct 
>> amdgpu_device *adev,
>>bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
>>bool amdgpu_device_has_dc_support(struct amdgpu_device 

Re: [PATCH v2] drm/amd/amdgpu implement tdr advanced mode

2021-03-06 Thread Christian König

Hi Jack,

Well, first of all, please completely drop the affinity group stuff from
this patch. We should concentrate on one feature at a time.


Then, the implementation is way too complicated. All you need to do is
insert a dma_fence_wait after re-scheduling each job after a reset.
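
i.e. something along these lines after the jobs have been pushed back to the ring
(a sketch only; 'sched' is the ring's scheduler and 'timeout' stands for whatever
per-ring timeout is appropriate):

/* After drm_sched_resubmit_jobs(), wait for each job's HW fence in order;
 * the first fence that fails to signal identifies the real bad job. */
struct drm_sched_job *s_job;

list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
	struct dma_fence *fence = s_job->s_fence->parent;

	if (fence && dma_fence_wait_timeout(fence, false, timeout) <= 0) {
		/* this job is the guilty one; handle it and reset again */
	}
}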


In addition, this feature is completely AMD specific and shouldn't
affect the common scheduler in any way.


Regards,
Christian.

On 06.03.21 at 18:25, Jack Zhang wrote:

[Why]
The previous TDR design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and
cause an unexpected gfx job timeout because the gfx and compute rings
share internal GC HW.

[How]
This patch implements an advanced TDR mode. It involves an additional
synchronous pre-resubmit step (Step 0 Resubmit) before the normal resubmit
step in order to find the real bad job.

1. For a bailing TDR job, re-insert it into the mirror_list, don't set it
to guilty, and leave it to be handled by the main reset thread.

2. Don't set the job to guilty in pre_asic_reset, and leave it to be
handled by the Step 0 Resubmit stage.

3. At the Step 0 Resubmit stage, first resubmit jobs asynchronously, then
iterate each ring's mirror_list and synchronously wait for each HW fence
to be signaled. If a job's HW fence times out, we identify it as guilty
and do a HW reset to recover the HW. After that, we do the normal
resubmit step to resubmit the remaining jobs.

4. For a whole-GPU reset (VRAM lost), skip Step 0 Resubmit since every
job after VRAM loss is considered a bad job.

5. Introduce the concept of an "affinity group".
Doing two HW resets is not necessary when only one ring among some
HW-related rings has jobs. Thus, we introduce the "affinity group":
HW-related rings can be added to a common affinity group, such as the
gfx and compute rings. When a TDR happens, we iterate all rings in the
affinity group and skip the Step 0 Resubmit stage if only one ring's
mirror_list has valid scheduler jobs.

V2:
 - Fix a cherry-pick mistake for the bailing TDR handling.

 - Do the affinity_group check according to the bad job's sched rather
   than the default "1", so that multiple affinity groups can be
   pre-defined in the future.

Signed-off-by: Jack Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|   2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  47 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|   2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
  include/drm/gpu_scheduler.h|   1 +
  7 files changed, 173 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..8632d7071292 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device 
*adev)
return false;
  }
  
+bool amdgpu_affinity_group_has_only_or_null_working_ring(struct amdgpu_device *adev, struct drm_sched_job *s_job)

+{
+   int i;
+   int working_ring_num = 0;
+
+   /*
+* The job is considered as the real bad one
+* if job's sched is not in affinity group
+*/
+   if (s_job->sched.affinity_group == 0)
+   return true;
+
+   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+   struct amdgpu_ring *ring = adev->rings[i];
+
+   if (!ring || !ring->sched.thread)
+   continue;
+
+   /* for non-empty affinity ring, increase working_ring_num */
+   if (ring->sched.affinity_group == s_job->sched.affinity_group) {
+			if (!list_empty(&ring->sched.ring_mirror_list))
+   working_ring_num++;
+   }
+   }
+
+   if (working_ring_num > 1) {
+   return false;
+   }
+   return true;
+}
+
  /**
   * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
   *
@@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
amdgpu_fence_driver_force_completion(ring);
}
  
-	if(job)
-		drm_sched_increase_karma(&job->base);
+	if (amdgpu_gpu_recovery != 2) {
+		if (job)
+			drm_sched_increase_karma(&job->base);
+	}
  
  	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */

if (!amdgpu_sriov_vf(adev)) {
@@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
-
+   int tmp_vram_lost_counter;
/*
 * Special case: RAS triggered and full reset isn't supported
 */
@@ -4690,8 +4723,16 @@ int 

[PATCH v2] drm/amd/amdgpu implement tdr advanced mode

2021-03-06 Thread Jack Zhang
[Why]
The previous TDR design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and
cause an unexpected gfx job timeout because the gfx and compute rings
share internal GC HW.

[How]
This patch implements an advanced TDR mode. It involves an additional
synchronous pre-resubmit step (Step 0 Resubmit) before the normal resubmit
step in order to find the real bad job.

1. For a bailing TDR job, re-insert it into the mirror_list, don't set it
to guilty, and leave it to be handled by the main reset thread.

2. Don't set the job to guilty in pre_asic_reset, and leave it to be
handled by the Step 0 Resubmit stage.

3. At the Step 0 Resubmit stage, first resubmit jobs asynchronously, then
iterate each ring's mirror_list and synchronously wait for each HW fence
to be signaled. If a job's HW fence times out, we identify it as guilty
and do a HW reset to recover the HW. After that, we do the normal
resubmit step to resubmit the remaining jobs. (A condensed sketch of this
step follows after this list.)

4. For a whole-GPU reset (VRAM lost), skip Step 0 Resubmit since every
job after VRAM loss is considered a bad job.

5. Introduce the concept of an "affinity group".
Doing two HW resets is not necessary when only one ring among some
HW-related rings has jobs. Thus, we introduce the "affinity group":
HW-related rings can be added to a common affinity group, such as the
gfx and compute rings. When a TDR happens, we iterate all rings in the
affinity group and skip the Step 0 Resubmit stage if only one ring's
mirror_list has valid scheduler jobs.
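
As referenced in point 3, a condensed sketch of the Step 0 flow (simplified;
'adev' and the per-ring timeout are as in the surrounding function, and the
real implementation is in the diff below, including the subsequent HW reset):

for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
	struct amdgpu_ring *ring = adev->rings[i];
	struct drm_sched_job *s_job;

	if (!ring || !ring->sched.thread)
		continue;

	/* Step 0: resubmit asynchronously ... */
	drm_sched_resubmit_jobs(&ring->sched);

	/* ... then wait for each HW fence; a timeout marks the real bad job */
	list_for_each_entry(s_job, &ring->sched.ring_mirror_list, node) {
		struct dma_fence *fence = s_job->s_fence->parent;

		if (fence && dma_fence_wait_timeout(fence, false,
						    ring->sched.timeout) <= 0)
			drm_sched_increase_karma(s_job);	/* guilty */
	}
}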

V2:
- Fix a cherry-pick mistake in the bailing TDR handling.

- Do the affinity_group check according to the bad job's sched rather
  than the default "1", so that multiple affinity groups can be
  pre-defined in the future.

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  47 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
 include/drm/gpu_scheduler.h|   1 +
 7 files changed, 173 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..8632d7071292 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4188,6 +4188,37 @@ bool amdgpu_device_has_job_running(struct amdgpu_device 
*adev)
return false;
 }
 
+bool amdgpu_affinity_group_has_only_or_null_working_ring(struct amdgpu_device 
*adev, struct drm_sched_job *s_job)
+{
+   int i;
+   int working_ring_num = 0;
+
+   /*
+* The job is considered as the real bad one
+* if job's sched is not in affinity group
+*/
+   if (s_job->sched.affinity_group == 0)
+   return true;
+
+   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+   struct amdgpu_ring *ring = adev->rings[i];
+
+   if (!ring || !ring->sched.thread)
+   continue;
+
+   /* for non-empty affinity ring, increase working_ring_num */
+   if (ring->sched.affinity_group == s_job->sched.affinity_group) {
+			if (!list_empty(&ring->sched.ring_mirror_list))
+   working_ring_num++;
+   }
+   }
+
+   if (working_ring_num > 1) {
+   return false;
+   }
+   return true;
+}
+
 /**
  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
  *
@@ -4310,8 +4341,10 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
amdgpu_fence_driver_force_completion(ring);
}
 
-	if(job)
-		drm_sched_increase_karma(&job->base);
+	if (amdgpu_gpu_recovery != 2) {
+		if (job)
+			drm_sched_increase_karma(&job->base);
+	}
 
/* Don't suspend on bare metal if we are not going to HW reset the ASIC 
*/
if (!amdgpu_sriov_vf(adev)) {
@@ -4639,7 +4672,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
-
+   int tmp_vram_lost_counter;
/*
 * Special case: RAS triggered and full reset isn't supported
 */
@@ -4690,8 +4723,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
job ? job->base.id : -1);
 
 		/* even we skipped this reset, still need to set the job to guilty */
-		if (job)
-			drm_sched_increase_karma(&job->base);
+		if (job) {
+			if (amdgpu_gpu_recovery == 2) {
+				if (&job->base) {
+					spin_lock(&job->base.sched->job_list_lock);

Re: [PATCH] drm/amdgpu: add ih waiter on process until checkpoint

2021-03-06 Thread Christian König



On 05.03.21 at 22:34, Kim, Jonathan wrote:

[AMD Official Use Only - Internal Distribution Only]


-Original Message-
From: Christian König 
Sent: Friday, March 5, 2021 3:18 PM
To: Kim, Jonathan ; amd-
g...@lists.freedesktop.org
Cc: Yang, Philip ; Kuehling, Felix
; Koenig, Christian 
Subject: Re: [PATCH] drm/amdgpu: add ih waiter on process until checkpoint

[CAUTION: External Email]

On 05.03.21 at 21:11, Jonathan Kim wrote:

Add an IH function to allow the caller to wait until ring entries are
processed up to the checkpoint write pointer.

This will be primarily used by HMM to drain pending page fault
interrupts before memory unmap to prevent HMM from handling stale

interrupts.

v2: Update title and description to clarify use.
Add rptr/wptr wrap counter checks to guarantee ring entries are
processed up to the checkpoint.

First of all, as I said, please use a wait_event; busy waiting is a clear NAK.

Who would do the wake though? Are you suggesting the wake be done in
amdgpu_ih_process? Or does the waiting happen in the caller, so this should go
somewhere higher (like amdgpu_amdkfd, for example)?


Signed-off-by: Jonathan Kim 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 68

+-

   drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h |  2 +
   2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index dc852af4f3b7..954518b4fb79 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -22,7 +22,7 @@
*/

   #include 
-
+#include 
   #include "amdgpu.h"
   #include "amdgpu_ih.h"

@@ -160,6 +160,72 @@ void amdgpu_ih_ring_write(struct

amdgpu_ih_ring *ih, const uint32_t *iv,

   }
   }

+/**
+ * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to
+checkpoint
+ *
+ * @adev: amdgpu_device pointer
+ * @ih: ih ring to process
+ *
+ * Used to ensure ring has processed IVs up to the checkpoint write

pointer.

+ */
+int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device

*adev,

+ struct amdgpu_ih_ring *ih) {
+ u64 rptr_check, wptr_check, rptr_wrap = 0, wptr_wrap = 0;
+ u32 prev_rptr, prev_wptr;
+
+ if (!ih->enabled || adev->shutdown)
+ return -ENODEV;
+
+ prev_wptr = amdgpu_ih_get_wptr(adev, ih);
+ /* Order wptr with rptr. */
+ rmb();
+ prev_rptr = READ_ONCE(ih->rptr);
+ rptr_check = prev_rptr | (rptr_wrap << 32);
+ wptr_check = prev_wptr | (wptr_wrap << 32);

Huh, what? That check doesn't even remotely make sense to me.

Can you clarify what you meant by creating a new 64 bit compare?
Snip from your last response:

"This way you do something like this:
1. get the wrap around counter.
2. get wptr
3. get rptr
4. compare the wrap around counter with the old one, if it has changed start 
over with #1
5. Use wrap around counter and rptr/wptr to come up with 64bit values.
6. Compare wptr with rptr/wrap around counter until we are sure the IHs are 
processed."

From a discussion with Felix, I interpreted this as a way to guarantee
rptr/wptr ordering so that rptr monotonically follows wptr per check.
I'm assuming rptr/wptr are 32 bits wide based on the use of ptr_mask in the
read/write functions, so masking the rptr/wptr wrap counts into the top 32 bits
would mark how far apart the rptr and wptr are per check.


Mhm, sounds like my description was a bit confusing. Let me try again.

First of all, rptr/wptr are not 32 bit; their maximum is 20 or 19 bits
IIRC (and they are in dwords, so 4M or 2M bytes).


Then the purpose of the wait_event() is to wait for changes of the rptr, 
so the matching wake_up() should be at the same place as calling 
amdgpu_ih_set_rptr().
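
i.e. roughly like this at the end of IV processing (a sketch; 'processing' would
be a new wait queue added to struct amdgpu_ih_ring):

/* in amdgpu_ih_process(), once the IVs have been dispatched */
amdgpu_ih_set_rptr(adev, ih);	/* publish the new read pointer */
wake_up_all(&ih->processing);	/* let waiters re-evaluate their condition */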


My original idea of the wrap around counter assumes that the counter is 
updated in amdgpu_ih_process(). That isn't strictly necessary, but it 
could be better even if it adds some overhead to IH processing.


If you want to implement it like below it should look something like this:

uint32_t rptr, wptr, tmp;

wptr = amdgpu_ih_get_wptr(adev, ih);
rmb();
rptr = READ_ONCE(ih->rptr);

if (rptr > wptr)
    rptr += ih->ptr_mask + 1;

wait_event(ih->processing, {
    tmp = amdgpu_ih_get_wptr(adev, ih);
    tmp |= wptr & ~ih->ptr_mask;
    if (tmp < wptr)
        tmp += ih->ptr_mask + 1;
    wptr = tmp;
    wptr >= rptr;
})

I would put the condition of the wait_event into a separate function to 
make it more readable and not use GCC extension, but essentially that's it.


The only problem this could have is that the wptr wraps around multiple
times between wake-ups. To handle this as well, we would need to
increment the wrap-around counter in amdgpu_ih_process(), but this means
some extra overhead during IH processing. And then I would also use
64-bit counters to make the whole thing easier to handle.
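
For illustration, the bookkeeping could look roughly like this inside the
processing loop (a sketch; 'processed64' is an assumed new 64-bit field on the
ring, and 'wptr' is the local write pointer as in amdgpu_ih_process):

while (ih->rptr != wptr) {
	u32 prev_rptr = ih->rptr;

	amdgpu_irq_dispatch(adev, ih);	/* advances ih->rptr */

	if (ih->rptr >= prev_rptr)
		ih->processed64 += ih->rptr - prev_rptr;
	else	/* rptr wrapped around the ring */
		ih->processed64 += ih->rptr + ih->ptr_mask + 1 - prev_rptr;
}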


Regards,
Christian.



Thanks,

Jon


Christian.


+
+ spin_begin();
+ while (true) {
+ bool 

[PATCH] drm/amd/amdgpu implement tdr advanced mode

2021-03-06 Thread Jack Zhang
[Why]
The previous TDR design treats the first job in job_timeout as the bad job.
But sometimes a later bad compute job can block a good gfx job and
cause an unexpected gfx job timeout because the gfx and compute rings
share internal GC HW.

[How]
This patch implements an advanced TDR mode. It involves an additional
synchronous pre-resubmit step (Step 0 Resubmit) before the normal resubmit
step in order to find the real bad job.

1. For a bailing TDR job, re-insert it into the mirror_list, don't set it
to guilty, and leave it to be handled by the main reset thread.

2. Don't set the job to guilty in pre_asic_reset, and leave it to be
handled by the Step 0 Resubmit stage.

3. At the Step 0 Resubmit stage, first resubmit jobs asynchronously, then
iterate each ring's mirror_list and synchronously wait for each HW fence
to be signaled. If a job's HW fence times out, we identify it as guilty
and do a HW reset to recover the HW. After that, we do the normal
resubmit step to resubmit the remaining jobs.

4. For a whole-GPU reset (VRAM lost), skip Step 0 Resubmit since every
job after VRAM loss is considered a bad job.

5. Introduce the concept of an "affinity group".
Doing two HW resets is not necessary when only one ring among some
HW-related rings has jobs. Thus, we introduce the "affinity group":
HW-related rings can be added to a common affinity group, such as the
gfx and compute rings. When a TDR happens, we iterate all rings in the
affinity group and skip the Step 0 Resubmit stage if only one ring's
mirror_list has valid scheduler jobs. (A sketch of the ring grouping
follows below.)
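
As noted in point 5, the grouping itself can be as simple as tagging the
scheduler when the ring is initialized (a sketch; the actual amdgpu_ring.c
change is not shown in full here):

/* mark gfx and compute schedulers as one affinity group since they share
 * GC HW; everything else stays in group 0 (no grouping) */
switch (ring->funcs->type) {
case AMDGPU_RING_TYPE_GFX:
case AMDGPU_RING_TYPE_COMPUTE:
	ring->sched.affinity_group = 1;
	break;
default:
	ring->sched.affinity_group = 0;
	break;
}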

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 101 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c|  47 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c   |  27 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
 include/drm/gpu_scheduler.h|   1 +
 7 files changed, 174 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e247c3a2ec08..877ab6c508e8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4188,6 +4188,30 @@ bool amdgpu_device_has_job_running(struct amdgpu_device 
*adev)
return false;
 }
 
+bool amdgpu_affinity_group_has_only_or_null_working_ring(struct amdgpu_device 
*adev)
+{
+   int i;
+   int working_ring_num = 0;
+
+   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+   struct amdgpu_ring *ring = adev->rings[i];
+
+   if (!ring || !ring->sched.thread)
+   continue;
+
+   /* for non-empty affinity ring, increase working_ring_num */
+   if (ring->sched.affinity_group == 1) {
+			if (!list_empty(&ring->sched.ring_mirror_list))
+   working_ring_num++;
+   }
+   }
+
+   if (working_ring_num > 1) {
+   return false;
+   }
+   return true;
+}
+
 /**
  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
  *
@@ -4310,8 +4334,10 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
amdgpu_fence_driver_force_completion(ring);
}
 
-	if(job)
-		drm_sched_increase_karma(&job->base);
+	if (amdgpu_gpu_recovery != 2) {
+		if (job)
+			drm_sched_increase_karma(&job->base);
+	}
 
/* Don't suspend on bare metal if we are not going to HW reset the ASIC 
*/
if (!amdgpu_sriov_vf(adev)) {
@@ -4639,7 +4665,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
-
+   int tmp_vram_lost_counter;
/*
 * Special case: RAS triggered and full reset isn't supported
 */
@@ -4706,6 +4732,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
 		device_list_handle = &hive->device_list;
} else {
+		/* if current dev is already in reset, skip adding list to prevent race issue */
+		if (!amdgpu_device_lock_adev(adev, hive)) {
+			dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+				 job ? job->base.id : -1);
+			r = 0;
+			/* even we skipped this reset, still need to set the job to guilty */
+			if (amdgpu_gpu_recovery == 2) {
+				if (&job->base) {
+					spin_lock(&job->base.sched->job_list_lock);
+					list_add(&job->base.node, &job->base.sched->ring_mirror_list);
+