On 8/8/2024 10:56 PM, Victor Skvortsov wrote:
> VFs do not perform HW fini/suspend in FLR, so the dpm_enabled
> is incorrectly kept enabled. Add interface to disable it in
> virt_pre_reset call.
> 
> Signed-off-by: Victor Skvortsov <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 10 +++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  8 +++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |  1 +
>  .../gpu/drm/amd/include/kgd_pp_interface.h    |  1 +
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 21 +++++++++++++++++++
>  5 files changed, 37 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 730dae77570c..1be5699f4190 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5288,10 +5288,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device 
> *adev,
>       if (reset_context->reset_req_dev == adev)
>               job = reset_context->job;
>  
> -     if (amdgpu_sriov_vf(adev)) {
> -             /* stop the data exchange thread */
> -             amdgpu_virt_fini_data_exchange(adev);
> -     }
> +     if (amdgpu_sriov_vf(adev))
> +             amdgpu_virt_pre_reset(adev);
>  
>       amdgpu_fence_driver_isr_toggle(adev, true);
>  
> @@ -5561,6 +5559,10 @@ int amdgpu_do_asic_reset(struct list_head 
> *device_list_handle,
>  
>  static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
>  {
> +     if (amdgpu_sriov_vf(adev)) {
> +             adev->mp1_state = PP_MP1_STATE_FLR;
> +             return;
> +     }

Better remove this change. If at all this state need to be persisted
through out the reset, handle it only through amdgpu_dpm_set_mp1_state.
For now, I don't see a reason to store this state, we only need this
state as a trigger for the action associated with this.
>  
>       switch (amdgpu_asic_reset_method(adev)) {
>       case AMD_RESET_METHOD_MODE1:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 111c380f929b..456a685c3975 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -33,6 +33,7 @@
>  #include "amdgpu.h"
>  #include "amdgpu_ras.h"
>  #include "amdgpu_reset.h"
> +#include "amdgpu_dpm.h"
>  #include "vi.h"
>  #include "soc15.h"
>  #include "nv.h"
> @@ -849,6 +850,13 @@ enum amdgpu_sriov_vf_mode 
> amdgpu_virt_get_sriov_vf_mode(struct amdgpu_device *ad
>       return mode;
>  }
>  
> +void amdgpu_virt_pre_reset(struct amdgpu_device *adev)
> +{
> +     /* stop the data exchange thread */
> +     amdgpu_virt_fini_data_exchange(adev);
> +     amdgpu_dpm_set_mp1_state(adev, PP_MP1_STATE_FLR);
> +}
> +
>  void amdgpu_virt_post_reset(struct amdgpu_device *adev)
>  {
>       if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index b42a8854dca0..b650a2032c42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -376,6 +376,7 @@ u32 amdgpu_sriov_rreg(struct amdgpu_device *adev,
>                     u32 offset, u32 acc_flags, u32 hwip, u32 xcc_id);
>  bool amdgpu_virt_fw_load_skip_check(struct amdgpu_device *adev,
>                       uint32_t ucode_id);
> +void amdgpu_virt_pre_reset(struct amdgpu_device *adev);
>  void amdgpu_virt_post_reset(struct amdgpu_device *adev);
>  bool amdgpu_sriov_xnack_support(struct amdgpu_device *adev);
>  bool amdgpu_virt_get_rlcg_reg_access_flag(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h 
> b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> index 4b20e2274313..19a48d98830a 100644
> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> @@ -218,6 +218,7 @@ enum pp_mp1_state {
>       PP_MP1_STATE_SHUTDOWN,
>       PP_MP1_STATE_UNLOAD,
>       PP_MP1_STATE_RESET,
> +     PP_MP1_STATE_FLR,
>  };
>  
>  enum pp_df_cstate {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> index 78c3f94bb3ff..b85478b1eaa7 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> @@ -2638,6 +2638,26 @@ static int smu_v13_0_6_send_rma_reason(struct 
> smu_context *smu)
>       return ret;
>  }
>  
> +static int smu_v13_0_6_set_mp1_state(struct smu_context *smu,
> +                             enum pp_mp1_state mp1_state)
> +{
> +     int ret =0;
> +
> +     switch (mp1_state) {
> +     case PP_MP1_STATE_FLR:
> +             /* VF lost access to SMU */
> +             smu->adev->pm.dpm_enabled = false;
> +             ret = 0;
> +             break;
> +     default:
> +             /* Ignore others */
> +             ret = 0;
> +     }
> +
> +     return ret;
> +}

Should this be made generic for all SOCs - i.e., handle it
amdgpu_dpm_set_mp1_state() itself? The logic remains the same - turn off
dpm flag during VF reset.

Thanks,
Lijo

> +
> +
>  static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
>  {
>       struct smu_context *smu = adev->powerplay.pp_handle;
> @@ -3283,6 +3303,7 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs 
> = {
>       .i2c_fini = smu_v13_0_6_i2c_control_fini,
>       .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
>       .send_rma_reason = smu_v13_0_6_send_rma_reason,
> +     .set_mp1_state = smu_v13_0_6_set_mp1_state,
>  };
>  
>  void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)

Reply via email to