On 8/29/2024 9:17 AM, Prike Liang wrote:
> Implement the compute pipe reset, and the driver will
> fallback to pipe reset when queue reset fails.
> The pipe reset only deactivates the queue which is
> scheduled in the pipe, and meanwhile the MEC engine
> will be reset to the firmware _start pointer. So,

May refine this to indicate that reset to _start is for the specific
pipe and not applicable for the whole MEC engine.

> it seems pipe reset will cost more cycles than the
> queue reset; therefore, the driver tries to recover
> by doing queue reset first.
> 
> Signed-off-by: Prike Liang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 127 ++++++++++++++++++++----
>  1 file changed, 108 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 2067f26d3a9d..26ae62d2a752 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -3466,6 +3466,98 @@ static void gfx_v9_4_3_emit_wave_limit(struct 
> amdgpu_ring *ring, bool enable)
>       }
>  }
>  
> +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
> +                             uint32_t pipe, uint32_t queue,
> +                             uint32_t xcc_id)
> +{
> +     int i, r;
> +     /* make sure dequeue is complete*/
> +     gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> +     mutex_lock(&adev->srbm_mutex);
> +     soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> +     for (i = 0; i < adev->usec_timeout; i++) {
> +             if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) 
> & 1))
> +                     break;
> +             udelay(1);
> +     }
> +     if (i >= adev->usec_timeout)
> +             r = -ETIMEDOUT;
> +     else
> +             r = 0;
> +     soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> +     mutex_unlock(&adev->srbm_mutex);
> +     gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> +
> +     return r;
> +
> +}
> +
> +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> +{
> +     /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as 
> well.*/
> +     if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> +                     adev->gfx.mec_fw_version >= 0x0000009b)
> +             return true;
> +     else
> +             dev_warn_once(adev->dev, "Please use the latest MEC version to 
> see whether support pipe reset\n");
> +
> +     return false;
> +}
> +
> +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *ring)

Please drop the kiq name in this function to avoid confusion. It's not
restricted to kiq.

With those

        Reviewed-by: Lijo Lazar <[email protected]>

Thanks,
Lijo

> +{
> +     struct amdgpu_device *adev = ring->adev;
> +     uint32_t reset_pipe, clean_pipe;
> +     int r;
> +
> +     if (!gfx_v9_4_3_pipe_reset_support(adev))
> +             return -EINVAL;
> +
> +     gfx_v9_4_3_xcc_set_safe_mode(adev, ring->xcc_id);
> +     mutex_lock(&adev->srbm_mutex);
> +
> +     reset_pipe = RREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), 
> regCP_MEC_CNTL);
> +     clean_pipe = reset_pipe;
> +
> +     if (ring->me == 1) {
> +             switch (ring->pipe) {
> +             case 0:
> +                     reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +                                                MEC_ME1_PIPE0_RESET, 1);
> +                     break;
> +             case 1:
> +                     reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +                                                MEC_ME1_PIPE1_RESET, 1);
> +                     break;
> +             case 2:
> +                     reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +                                                MEC_ME1_PIPE2_RESET, 1);
> +                     break;
> +             case 3:
> +                     reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +                                                MEC_ME1_PIPE3_RESET, 1);
> +                     break;
> +             default:
> +                     break;
> +             }
> +     } else {
> +             if (ring->pipe)
> +                     reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +                                                MEC_ME2_PIPE1_RESET, 1);
> +             else
> +                     reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +                                                MEC_ME2_PIPE0_RESET, 1);
> +     }
> +
> +     WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL, 
> reset_pipe);
> +     WREG32_SOC15(GC, GET_INST(GC, ring->xcc_id), regCP_MEC_CNTL, 
> clean_pipe);
> +     mutex_unlock(&adev->srbm_mutex);
> +     gfx_v9_4_3_xcc_unset_safe_mode(adev, ring->xcc_id);
> +
> +     r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, 
> ring->xcc_id);
> +     return r;
> +}
> +
>  static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
>                               unsigned int vmid)
>  {
> @@ -3473,7 +3565,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring 
> *ring,
>       struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
>       struct amdgpu_ring *kiq_ring = &kiq->ring;
>       unsigned long flags;
> -     int r, i;
> +     int r;
>  
>       if (amdgpu_sriov_vf(adev))
>               return -EINVAL;
> @@ -3495,26 +3587,23 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring 
> *ring,
>       spin_unlock_irqrestore(&kiq->ring_lock, flags);
>  
>       r = amdgpu_ring_test_ring(kiq_ring);
> +     if (r) {
> +             dev_err(adev->dev, "kiq ring test failed after ring: %s queue 
> reset\n",
> +                             ring->name);
> +             goto pipe_reset;
> +     }
> +
> +     r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, 
> ring->xcc_id);
>       if (r)
> -             return r;
> +             dev_err(adev->dev, "fail to wait on hqd deactive and will try 
> pipe reset\n");
>  
> -     /* make sure dequeue is complete*/
> -     amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> -     mutex_lock(&adev->srbm_mutex);
> -     soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, 
> GET_INST(GC, ring->xcc_id));
> -     for (i = 0; i < adev->usec_timeout; i++) {
> -             if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> -                     break;
> -             udelay(1);
> -     }
> -     if (i >= adev->usec_timeout)
> -             r = -ETIMEDOUT;
> -     soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> -     mutex_unlock(&adev->srbm_mutex);
> -     amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
> -     if (r) {
> -             dev_err(adev->dev, "fail to wait on hqd deactive\n");
> -             return r;
> +pipe_reset:
> +     if(r) {
> +             r = gfx_v9_4_3_kiq_reset_hw_pipe(ring);
> +             dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name,
> +                             r ? "failed" : "successfully");
> +             if (r)
> +                     return r;
>       }
>  
>       r = amdgpu_bo_reserve(ring->mqd_obj, false);

Reply via email to