<Ping>

On 11/18/2024 11:14 AM, Lijo Lazar wrote:
> Before scheduling a recovery due to scheduler/job hang, check if a RAS
> error is detected. If so, choose RAS recovery to handle the situation. A
> scheduler/job hang could be the side effect of a RAS error. In such
> cases, it is required to go through the RAS error recovery process. A
> RAS error recovery process, in certain cases, can also avoid a full
> device reset.
> 
> An error state is maintained in the RAS context to track the affected
> block. Fatal error state uses an otherwise unused block id. The block
> id is set when an error is detected. If the interrupt handler has
> already detected a poison error, there is no need to look for a fatal
> error; skip the fatal error check in such cases.
> 
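For readers following the encoding: below is a minimal userspace sketch of
the scheme described above (my own simplified model with placeholder names,
not driver code). Each RAS block gets one bit in an unsigned long, poison
consumption sets the bit of the affected block, and the fatal-error (FED)
state reuses the otherwise unused AMDGPU_RAS_BLOCK__LAST bit, so "any error
pending" reduces to a nonzero test. The real driver uses the kernel's atomic
set_bit()/test_bit() helpers on ras->ras_err_state.

    /* Simplified userspace model of the ras_err_state bitmask; plain bit
     * operations stand in for the kernel's atomic bitops.
     */
    #include <stdbool.h>
    #include <stdio.h>

    enum ras_block { BLOCK_GFX, BLOCK_SDMA, BLOCK_LAST, BLOCK_ANY = -1 };

    static unsigned long err_state; /* one bit per block; BLOCK_LAST = fatal */

    static void set_err_poison(enum ras_block b) { err_state |= 1UL << b; }

    static void set_fed(bool on)
    {
            if (on)
                    err_state |= 1UL << BLOCK_LAST;
            else
                    err_state &= ~(1UL << BLOCK_LAST);
    }

    static void clear_err_state(void) { err_state = 0; }

    static bool is_err_state(int b)
    {
            if (b == BLOCK_ANY)
                    return err_state != 0;
            /* a fatal error counts as an error on every block */
            return (err_state & (1UL << b)) || (err_state & (1UL << BLOCK_LAST));
    }

    int main(void)
    {
            set_err_poison(BLOCK_SDMA); /* poison consumed on SDMA */
            printf("any=%d gfx=%d\n", is_err_state(BLOCK_ANY), is_err_state(BLOCK_GFX));
            set_fed(true);              /* fatal error detected */
            printf("gfx after fed=%d\n", is_err_state(BLOCK_GFX));
            clear_err_state();          /* after reset recovery completes */
            printf("any after clear=%d\n", is_err_state(BLOCK_ANY));
            return 0;
    }
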
> Signed-off-by: Lijo Lazar <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/aldebaran.c        |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 15 ++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 55 ++++++++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       | 11 +++-
>  .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  2 +
>  5 files changed, 78 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 3a588fecb0c5..6a2fd9e4f470 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -332,6 +332,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>       list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
>               dev_info(tmp_adev->dev,
>                        "GPU reset succeeded, trying to resume\n");
> +             /*TBD: Ideally should clear only GFX, SDMA blocks*/
> +             amdgpu_ras_clear_err_state(tmp_adev);
>               r = aldebaran_mode2_restore_ip(tmp_adev);
>               if (r)
>                       goto end;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b3ca911e55d6..b4464f42d72a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5165,7 +5165,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>       if (r)
>               return r;
>  
> -     amdgpu_ras_set_fed(adev, false);
> +     amdgpu_ras_clear_err_state(adev);
>       amdgpu_irq_gpu_reset_resume_helper(adev);
>  
>       /* some sw clean up VF needs to do before recover */
> @@ -5460,7 +5460,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>               amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
>               if (full_reset) {
>                       /* post card */
> -                     amdgpu_ras_set_fed(tmp_adev, false);
> +                     amdgpu_ras_clear_err_state(tmp_adev);
>                       r = amdgpu_device_asic_init(tmp_adev);
>                       if (r) {
>                               dev_warn(tmp_adev->dev, "asic atom init failed!");
> @@ -5786,6 +5786,17 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>       bool audio_suspended = false;
>       int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
>  
> +     /*
> +      * If it reaches here because of hang/timeout and a RAS error is
> +      * detected at the same time, let RAS recovery take care of it.
> +      */
> +     if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) &&
> +         reset_context->src != AMDGPU_RESET_SRC_RAS) {
> +             dev_dbg(adev->dev,
> +                     "Gpu recovery from source: %d yielding to RAS error recovery handling",
> +                     reset_context->src);
> +             return 0;
> +     }
>       /*
>        * Special case: RAS triggered and full reset isn't supported
>        */
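If I read the intent correctly (my inference, not stated in the patch): a
hang/timeout-initiated recovery now backs off whenever any RAS error bit is
set, on the assumption that the RAS path will follow up with its own reset
whose source is AMDGPU_RESET_SRC_RAS. A condensed model of that decision,
with placeholder names rather than the amdgpu API:

    #include <stdbool.h>
    #include <stdio.h>

    enum reset_src { SRC_JOB_TIMEOUT, SRC_RAS };

    /* Recovery proceeds unless a RAS error is pending and the request did
     * not come from RAS recovery itself.
     */
    static bool proceed_with_recovery(bool ras_err_pending, enum reset_src src)
    {
            return !(ras_err_pending && src != SRC_RAS);
    }

    int main(void)
    {
            printf("%d\n", proceed_with_recovery(true, SRC_JOB_TIMEOUT));  /* 0: yield to RAS */
            printf("%d\n", proceed_with_recovery(true, SRC_RAS));          /* 1: RAS reset runs */
            printf("%d\n", proceed_with_recovery(false, SRC_JOB_TIMEOUT)); /* 1: normal recovery */
            return 0;
    }
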
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1bc95b0cdbb8..e31b12144577 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2156,6 +2156,16 @@ void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
>       /* Fatal error events are handled on host side */
>       if (amdgpu_sriov_vf(adev))
>               return;
> +     /**
> +      * If the current interrupt is caused by a non-fatal RAS error, skip
> +      * check for fatal error. For fatal errors, FED status of all devices
> +      * in XGMI hive gets set when the first device gets fatal error
> +      * interrupt. The error gets propagated to other devices as well, so
> +      * make sure to ack the interrupt regardless of FED status.
> +      */
> +     if (!amdgpu_ras_get_fed_status(adev) &&
> +         amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY))
> +             return;
>  
>       if (adev->nbio.ras &&
>           adev->nbio.ras->handle_ras_controller_intr_no_bifring)
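A small sketch of the skip rule added above, again with placeholder names
rather than the amdgpu API: the fatal-error check is skipped only when some
block has already reported a (poison/non-fatal) error and the fatal-error
flag is not set for this device; otherwise the handler proceeds so the
interrupt is acknowledged across the XGMI hive regardless of FED status.

    #include <assert.h>
    #include <stdbool.h>

    static bool skip_fatal_check(bool fed_set, bool any_err_pending)
    {
            return !fed_set && any_err_pending;
    }

    int main(void)
    {
            assert(skip_fatal_check(false, true));   /* poison already seen: skip */
            assert(!skip_fatal_check(true, true));   /* FED set: still handle/ack */
            assert(!skip_fatal_check(false, false)); /* nothing pending: do the check */
            return 0;
    }
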
> @@ -2185,6 +2195,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
>       if (ret)
>               return;
>  
> +     amdgpu_ras_set_err_poison(adev, block_obj->ras_comm.block);
>       /* both query_poison_status and handle_poison_consumption are optional,
>        * but at least one of them should be implemented if we need poison
>        * consumption handler
> @@ -4083,16 +4094,56 @@ bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
>       if (!ras)
>               return false;
>  
> -     return atomic_read(&ras->fed);
> +     return test_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
>  }
>  
>  void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
>  {
>       struct amdgpu_ras *ras;
>  
> +     ras = amdgpu_ras_get_context(adev);
> +     if (ras) {
> +             if (status)
> +                     set_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
> +             else
> +                     clear_bit(AMDGPU_RAS_BLOCK__LAST, &ras->ras_err_state);
> +     }
> +}
> +
> +void amdgpu_ras_clear_err_state(struct amdgpu_device *adev)
> +{
> +     struct amdgpu_ras *ras;
> +
>       ras = amdgpu_ras_get_context(adev);
>       if (ras)
> -             atomic_set(&ras->fed, !!status);
> +             ras->ras_err_state = 0;
> +}
> +
> +void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
> +                            enum amdgpu_ras_block block)
> +{
> +     struct amdgpu_ras *ras;
> +
> +     ras = amdgpu_ras_get_context(adev);
> +     if (ras)
> +             set_bit(block, &ras->ras_err_state);
> +}
> +
> +bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block)
> +{
> +     struct amdgpu_ras *ras;
> +
> +     ras = amdgpu_ras_get_context(adev);
> +     if (ras) {
> +             if (block == AMDGPU_RAS_BLOCK__ANY)
> +                     return (ras->ras_err_state != 0);
> +             else
> +                     return test_bit(block, &ras->ras_err_state) ||
> +                            test_bit(AMDGPU_RAS_BLOCK__LAST,
> +                                     &ras->ras_err_state);
> +     }
> +
> +     return false;
>  }
>  
>  static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6db772ecfee4..b13debcf48ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -99,7 +99,8 @@ enum amdgpu_ras_block {
>       AMDGPU_RAS_BLOCK__IH,
>       AMDGPU_RAS_BLOCK__MPIO,
>  
> -     AMDGPU_RAS_BLOCK__LAST
> +     AMDGPU_RAS_BLOCK__LAST,
> +     AMDGPU_RAS_BLOCK__ANY = -1
>  };
>  
>  enum amdgpu_ras_mca_block {
> @@ -558,8 +559,8 @@ struct amdgpu_ras {
>       struct ras_ecc_log_info  umc_ecc_log;
>       struct delayed_work page_retirement_dwork;
>  
> -     /* Fatal error detected flag */
> -     atomic_t fed;
> +     /* ras errors detected */
> +     unsigned long ras_err_state;
>  
>       /* RAS event manager */
>       struct ras_event_manager __event_mgr;
> @@ -952,6 +953,10 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
>  
>  void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
>  bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
> +void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
> +                            enum amdgpu_ras_block block);
> +void amdgpu_ras_clear_err_state(struct amdgpu_device *adev);
> +bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block);
>  
>  u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
>  int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index d46a13156ee9..0cb5c582ce7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -184,6 +184,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>               } else {
>                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>               }
> +             amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__GFX);
>               break;
>       case SOC15_IH_CLIENTID_VMC:
>       case SOC15_IH_CLIENTID_VMC1:
> @@ -213,6 +214,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>               } else {
>                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
>               }
> +             amdgpu_ras_set_err_poison(dev->adev, AMDGPU_RAS_BLOCK__SDMA);
>               break;
>       default:
>               dev_warn(dev->adev->dev,
