Re: [PATCH] drm/amdgpu: fix a race in GPU reset with IB test (v2)

2019-05-31 Thread Christian König

Am 29.05.19 um 20:44 schrieb Alex Deucher:

Split late_init into two functions, one (do_late_init) which
just does the hw init, and late_init which calls do_late_init
and schedules the IB test work.  Call do_late_init in
the GPU reset code to run the init code, but not schedule
the IB test code.  The IB test code is called directly
in the GPU reset code, so there is no need to run the IB tests
in a separate work thread.  If we do, we end up racing.

v2: Rework late_init.  Pull out the mgpu fan boost and xgmi
pstate code into late_init so they get called in all cases.
Rename the late_init worker thread to delayed work since it's
just the IB tests now, which can happen later.  Schedule the
work at init and resume time.  It's not needed at reset time
because the IB tests are called directly.

Cc: Xinhui Pan 
Signed-off-by: Alex Deucher 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h|   2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 116 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|   2 +-
  3 files changed, 61 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d355e9a09ad1..19a00282e34c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -925,7 +925,7 @@ struct amdgpu_device {
const struct amdgpu_df_funcs*df_funcs;
  
  	/* delayed work_func for deferring clockgating during resume */

-   struct delayed_work late_init_work;
+   struct delayed_work delayed_init_work;
  
  	struct amdgpu_virt	virt;

/* firmware VRAM reservation */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7a8c2201cd04..d00fd5dd307a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1869,6 +1869,43 @@ static int amdgpu_device_set_pg_state(struct 
amdgpu_device *adev, enum amd_power
return 0;
  }
  
+static int amdgpu_device_enable_mgpu_fan_boost(void)

+{
+   struct amdgpu_gpu_instance *gpu_ins;
+   struct amdgpu_device *adev;
+   int i, ret = 0;
+
+   mutex_lock(&mgpu_info.mutex);
+
+   /*
+* MGPU fan boost feature should be enabled
+* only when there are two or more dGPUs in
+* the system
+*/
+   if (mgpu_info.num_dgpu < 2)
+   goto out;
+
+   for (i = 0; i < mgpu_info.num_dgpu; i++) {
+   gpu_ins = &(mgpu_info.gpu_ins[i]);
+   adev = gpu_ins->adev;
+   if (!(adev->flags & AMD_IS_APU) &&
+   !gpu_ins->mgpu_fan_enabled &&
+   adev->powerplay.pp_funcs &&
+   adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
+   ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
+   if (ret)
+   break;
+
+   gpu_ins->mgpu_fan_enabled = 1;
+   }
+   }
+
+out:
+   mutex_unlock(&mgpu_info.mutex);
+
+   return ret;
+}
+
  /**
   * amdgpu_device_ip_late_init - run late init for hardware IPs
   *
@@ -1902,11 +1939,15 @@ static int amdgpu_device_ip_late_init(struct 
amdgpu_device *adev)
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
  
-	queue_delayed_work(system_wq, &adev->late_init_work,

-  msecs_to_jiffies(AMDGPU_RESUME_MS));
-
amdgpu_device_fill_reset_magic(adev);
  
+	r = amdgpu_device_enable_mgpu_fan_boost();

+   if (r)
+   DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
+
+   /* set to low pstate by default */
+   amdgpu_xgmi_set_pstate(adev, 0);
+
return 0;
  }
  
@@ -2005,65 +2046,20 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)

return 0;
  }
  
-static int amdgpu_device_enable_mgpu_fan_boost(void)

-{
-   struct amdgpu_gpu_instance *gpu_ins;
-   struct amdgpu_device *adev;
-   int i, ret = 0;
-
-   mutex_lock(&mgpu_info.mutex);
-
-   /*
-* MGPU fan boost feature should be enabled
-* only when there are two or more dGPUs in
-* the system
-*/
-   if (mgpu_info.num_dgpu < 2)
-   goto out;
-
-   for (i = 0; i < mgpu_info.num_dgpu; i++) {
-   gpu_ins = &(mgpu_info.gpu_ins[i]);
-   adev = gpu_ins->adev;
-   if (!(adev->flags & AMD_IS_APU) &&
-   !gpu_ins->mgpu_fan_enabled &&
-   adev->powerplay.pp_funcs &&
-   adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
-   ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
-   if (ret)
-   break;
-
-   gpu_ins->mgpu_fan_enabled = 1;
-   }
-   }
-
-out:
-   mutex_unlock(&mgpu_info.mutex);
-
-   return ret;

回复: [PATCH] drm/amdgpu: fix a race in GPU reset with IB test (v2)

2019-05-29 Thread Pan, Xinhui
looks good to me.


发件人: Alex Deucher 
发送时间: 2019年5月30日 2:44
收件人: amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; Pan, Xinhui
主题: [PATCH] drm/amdgpu: fix a race in GPU reset with IB test (v2)

Split late_init into two functions, one (do_late_init) which
just does the hw init, and late_init which calls do_late_init
and schedules the IB test work.  Call do_late_init in
the GPU reset code to run the init code, but not schedule
the IB test code.  The IB test code is called directly
in the GPU reset code, so there is no need to run the IB tests
in a separate work thread.  If we do, we end up racing.

v2: Rework late_init.  Pull out the mgpu fan boost and xgmi
pstate code into late_init so they get called in all cases.
Rename the late_init worker thread to delayed work since it's
just the IB tests now, which can happen later.  Schedule the
work at init and resume time.  It's not needed at reset time
because the IB tests are called directly.

Cc: Xinhui Pan 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 116 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|   2 +-
 3 files changed, 61 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d355e9a09ad1..19a00282e34c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -925,7 +925,7 @@ struct amdgpu_device {
const struct amdgpu_df_funcs*df_funcs;

/* delayed work_func for deferring clockgating during resume */
-   struct delayed_work late_init_work;
+   struct delayed_work delayed_init_work;

struct amdgpu_virt  virt;
/* firmware VRAM reservation */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7a8c2201cd04..d00fd5dd307a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1869,6 +1869,43 @@ static int amdgpu_device_set_pg_state(struct 
amdgpu_device *adev, enum amd_power
return 0;
 }

+static int amdgpu_device_enable_mgpu_fan_boost(void)
+{
+   struct amdgpu_gpu_instance *gpu_ins;
+   struct amdgpu_device *adev;
+   int i, ret = 0;
+
+   mutex_lock(&mgpu_info.mutex);
+
+   /*
+* MGPU fan boost feature should be enabled
+* only when there are two or more dGPUs in
+* the system
+*/
+   if (mgpu_info.num_dgpu < 2)
+   goto out;
+
+   for (i = 0; i < mgpu_info.num_dgpu; i++) {
+   gpu_ins = &(mgpu_info.gpu_ins[i]);
+   adev = gpu_ins->adev;
+   if (!(adev->flags & AMD_IS_APU) &&
+   !gpu_ins->mgpu_fan_enabled &&
+   adev->powerplay.pp_funcs &&
+   adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
+   ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
+   if (ret)
+   break;
+
+   gpu_ins->mgpu_fan_enabled = 1;
+   }
+   }
+
+out:
+   mutex_unlock(&mgpu_info.mutex);
+
+   return ret;
+}
+
 /**
  * amdgpu_device_ip_late_init - run late init for hardware IPs
  *
@@ -1902,11 +1939,15 @@ static int amdgpu_device_ip_late_init(struct 
amdgpu_device *adev)
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

-   queue_delayed_work(system_wq, &adev->late_init_work,
-  msecs_to_jiffies(AMDGPU_RESUME_MS));
-
amdgpu_device_fill_reset_magic(adev);

+   r = amdgpu_device_enable_mgpu_fan_boost();
+   if (r)
+   DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
+
+   /* set to low pstate by default */
+   amdgpu_xgmi_set_pstate(adev, 0);
+
return 0;
 }

@@ -2005,65 +2046,20 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
return 0;
 }

-static int amdgpu_device_enable_mgpu_fan_boost(void)
-{
-   struct amdgpu_gpu_instance *gpu_ins;
-   struct amdgpu_device *adev;
-   int i, ret = 0;
-
-   mutex_lock(&mgpu_info.mutex);
-
-   /*
-* MGPU fan boost feature should be enabled
-* only when there are two or more dGPUs in
-* the system
-*/
-   if (mgpu_info.num_dgpu < 2)
-   goto out;
-
-   for (i = 0; i < mgpu_info.num_dgpu; i++) {
-   gpu_ins = &(mgpu_info.gpu_ins[i]);
-   adev = gpu_ins->adev;
-   if (!(adev->flags & AMD_IS_APU) &&
-   !gpu_ins->mgpu_fan_enabled &&
-   adev->powerplay.pp_funcs &&
-   adev

[PATCH] drm/amdgpu: fix a race in GPU reset with IB test (v2)

2019-05-29 Thread Alex Deucher
Split late_init into two functions, one (do_late_init) which
just does the hw init, and late_init which calls do_late_init
and schedules the IB test work.  Call do_late_init in
the GPU reset code to run the init code, but not schedule
the IB test code.  The IB test code is called directly
in the GPU reset code, so there is no need to run the IB tests
in a separate work thread.  If we do, we end up racing.

v2: Rework late_init.  Pull out the mgpu fan boost and xgmi
pstate code into late_init so they get called in all cases.
Rename the late_init worker thread to delayed work since it's
just the IB tests now, which can happen later.  Schedule the
work at init and resume time.  It's not needed at reset time
because the IB tests are called directly.

Cc: Xinhui Pan 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 116 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|   2 +-
 3 files changed, 61 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d355e9a09ad1..19a00282e34c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -925,7 +925,7 @@ struct amdgpu_device {
const struct amdgpu_df_funcs*df_funcs;
 
/* delayed work_func for deferring clockgating during resume */
-   struct delayed_work late_init_work;
+   struct delayed_work delayed_init_work;
 
struct amdgpu_virt  virt;
/* firmware VRAM reservation */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7a8c2201cd04..d00fd5dd307a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1869,6 +1869,43 @@ static int amdgpu_device_set_pg_state(struct 
amdgpu_device *adev, enum amd_power
return 0;
 }
 
+static int amdgpu_device_enable_mgpu_fan_boost(void)
+{
+   struct amdgpu_gpu_instance *gpu_ins;
+   struct amdgpu_device *adev;
+   int i, ret = 0;
+
+   mutex_lock(&mgpu_info.mutex);
+
+   /*
+* MGPU fan boost feature should be enabled
+* only when there are two or more dGPUs in
+* the system
+*/
+   if (mgpu_info.num_dgpu < 2)
+   goto out;
+
+   for (i = 0; i < mgpu_info.num_dgpu; i++) {
+   gpu_ins = &(mgpu_info.gpu_ins[i]);
+   adev = gpu_ins->adev;
+   if (!(adev->flags & AMD_IS_APU) &&
+   !gpu_ins->mgpu_fan_enabled &&
+   adev->powerplay.pp_funcs &&
+   adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
+   ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
+   if (ret)
+   break;
+
+   gpu_ins->mgpu_fan_enabled = 1;
+   }
+   }
+
+out:
+   mutex_unlock(&mgpu_info.mutex);
+
+   return ret;
+}
+
 /**
  * amdgpu_device_ip_late_init - run late init for hardware IPs
  *
@@ -1902,11 +1939,15 @@ static int amdgpu_device_ip_late_init(struct 
amdgpu_device *adev)
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 
-   queue_delayed_work(system_wq, &adev->late_init_work,
-  msecs_to_jiffies(AMDGPU_RESUME_MS));
-
amdgpu_device_fill_reset_magic(adev);
 
+   r = amdgpu_device_enable_mgpu_fan_boost();
+   if (r)
+   DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
+
+   /* set to low pstate by default */
+   amdgpu_xgmi_set_pstate(adev, 0);
+
return 0;
 }
 
@@ -2005,65 +2046,20 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
return 0;
 }
 
-static int amdgpu_device_enable_mgpu_fan_boost(void)
-{
-   struct amdgpu_gpu_instance *gpu_ins;
-   struct amdgpu_device *adev;
-   int i, ret = 0;
-
-   mutex_lock(&mgpu_info.mutex);
-
-   /*
-* MGPU fan boost feature should be enabled
-* only when there are two or more dGPUs in
-* the system
-*/
-   if (mgpu_info.num_dgpu < 2)
-   goto out;
-
-   for (i = 0; i < mgpu_info.num_dgpu; i++) {
-   gpu_ins = &(mgpu_info.gpu_ins[i]);
-   adev = gpu_ins->adev;
-   if (!(adev->flags & AMD_IS_APU) &&
-   !gpu_ins->mgpu_fan_enabled &&
-   adev->powerplay.pp_funcs &&
-   adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
-   ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
-   if (ret)
-   break;
-
-   gpu_ins->mgpu_fan_enabled = 1;
-   }
-   }
-
-out:
-   mutex_unlock(&mgpu_info.mutex);
-
-   return ret;
-}
-
 /**
- * amdgpu_device_ip_late_init_func_handler - work handler for i