On Fri, Apr 3, 2026 at 5:49 AM Jesse Zhang <[email protected]> wrote:
>
> Hold MEC pipe reset asserted, walk every queue on that (me, pipe) and tear
> down CP_HQD_ACTIVE / CP_HQD_DEQUEUE_REQUEST via
> gfx_v11_0_clear_hqds_on_mec_pipe(),
> then deassert reset. Avoids releasing pipe reset while HQDs may still be
> active.
>
> Legacy (non-RS64) path: read CP_MEC_CNTL for the reset mask instead of
> reusing CP_MEC_RS64_CNTL state.
>
> Suggested-by: Manu Rastogi <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 148 +++++++++++++++----------
> 1 file changed, 91 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index ae39b9e1f7d6..18b92990179d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -6906,11 +6906,39 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring
> *ring,
> return amdgpu_ring_reset_helper_end(ring, timedout_fence);
> }
>
> +/*
> + * With MEC pipe reset asserted, clear CP_HQD_ACTIVE /
> CP_HQD_DEQUEUE_REQUEST for
> + * every queue on (me, pipe). HQDs must be torn down while pipe reset stays
> + * asserted; only then clear the pipe reset bit.
> + * Caller must hold adev->srbm_mutex.
> + */
> +static void gfx_v11_0_clear_hqds_on_mec_pipe(struct amdgpu_device *adev, u32
> me,
> + u32 pipe)
> +{
> + unsigned int q;
> + int j;
> +
> + for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
> + soc21_grbm_select(adev, me, pipe, q, 0);
> + /* Start from a clean HQD dequeue state before forcing HQD
> inactive. */
> + WREG32_SOC15(GC, 0, regCP_HQD_ACTIVE, 0);
> + if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) {
> + WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 1);
> + for (j = 0; j < adev->usec_timeout; j++) {
> + if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) &
> 1))
> + break;
> + udelay(1);
> + }
> + }
> + WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0);
> + }
> +}
Do we have a way to look up which queue was actually on the pipe?
This will lead to all queues on this pipe ultimately timing out the
next time they are used since we never re-enable the queues after the
reset. I think we need something like amdgpu_vcn_reset_engine() for
each compute or gfx pipe. We need to stop the schedulers for all
queues on the pipe and then mark the fences with an error and then
make sure to re-enable and test all of the queues after the reset.
Ideally, we could resume any queues that weren't on the pipe after the
reset non-destructively, but I'm not sure if that is possible or not.
If the pipe reset doesn't affect any queues which were not on the pipe
at the time, then we can just loop over all of the queues on the pipe,
and for the non-guilty ones (i.e., the ones not on the pipe), we can
save their entire unprocessed ring state and reemit it. For the
guilty one, we already save its non-guilty state and restore it
after the reset. We also need to handle KFD queues. I think we need
to preempt the KFD queues at the start of this function and then
restore them at the end to make sure they are properly handled as
well.
Alex
> +
> static int gfx_v11_0_reset_compute_pipe(struct amdgpu_ring *ring)
> {
>
> struct amdgpu_device *adev = ring->adev;
> - uint32_t reset_pipe = 0, clean_pipe = 0;
> + uint32_t reset_val, clean_val;
> int r;
>
> if (!gfx_v11_pipe_reset_support(adev))
> @@ -6920,69 +6948,73 @@ static int gfx_v11_0_reset_compute_pipe(struct
> amdgpu_ring *ring)
> mutex_lock(&adev->srbm_mutex);
> soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
>
> - reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
> - clean_pipe = reset_pipe;
> -
> if (adev->gfx.rs64_enable) {
> + reset_val = RREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL);
> + clean_val = reset_val;
>
> switch (ring->pipe) {
> case 0:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE0_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE0_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE0_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE0_RESET, 0);
> break;
> case 1:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE1_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE1_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE1_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE1_RESET, 0);
> break;
> case 2:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE2_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE2_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE2_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE2_RESET, 0);
> break;
> case 3:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE3_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_RS64_CNTL,
> - MEC_PIPE3_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE3_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val, CP_MEC_RS64_CNTL,
> + MEC_PIPE3_RESET, 0);
> break;
> default:
> break;
> }
> - WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_pipe);
> - WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_pipe);
> + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, reset_val);
> + gfx_v11_0_clear_hqds_on_mec_pipe(adev, ring->me, ring->pipe);
> + soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> + WREG32_SOC15(GC, 0, regCP_MEC_RS64_CNTL, clean_val);
> r = (RREG32_SOC15(GC, 0, regCP_MEC_RS64_INSTR_PNTR) << 2) -
> RS64_FW_UC_START_ADDR_LO;
> } else {
> + reset_val = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
> + clean_val = reset_val;
> +
> if (ring->me == 1) {
> switch (ring->pipe) {
> case 0:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE0_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE0_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE0_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE0_RESET, 0);
> break;
> case 1:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE1_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE1_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE1_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE1_RESET, 0);
> break;
> case 2:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE2_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE2_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE2_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE2_RESET, 0);
> break;
> case 3:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE3_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME1_PIPE3_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE3_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME1_PIPE3_RESET, 0);
> break;
> default:
> break;
> @@ -6991,36 +7023,38 @@ static int gfx_v11_0_reset_compute_pipe(struct
> amdgpu_ring *ring)
> } else {
> switch (ring->pipe) {
> case 0:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE0_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE0_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE0_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE0_RESET, 0);
> break;
> case 1:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE1_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE1_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE1_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE1_RESET, 0);
> break;
> case 2:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE2_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE2_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE2_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE2_RESET, 0);
> break;
> case 3:
> - reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE3_RESET, 1);
> - clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> -
> MEC_ME2_PIPE3_RESET, 0);
> + reset_val = REG_SET_FIELD(reset_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE3_RESET, 1);
> + clean_val = REG_SET_FIELD(clean_val,
> CP_MEC_CNTL,
> +
> MEC_ME2_PIPE3_RESET, 0);
> break;
> default:
> break;
> }
> /* mec2 fw pc: CP:CP_MEC2_INSTR_PNTR */
> }
> - WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_pipe);
> - WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_pipe);
> + WREG32_SOC15(GC, 0, regCP_MEC_CNTL, reset_val);
> + gfx_v11_0_clear_hqds_on_mec_pipe(adev, ring->me, ring->pipe);
> + soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> + WREG32_SOC15(GC, 0, regCP_MEC_CNTL, clean_val);
> r = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_MEC1_INSTR_PNTR));
> }
>
> @@ -7028,7 +7062,7 @@ static int gfx_v11_0_reset_compute_pipe(struct
> amdgpu_ring *ring)
> mutex_unlock(&adev->srbm_mutex);
> gfx_v11_0_unset_safe_mode(adev, 0);
>
> - dev_info(adev->dev, "The ring %s pipe resets to MEC FW start PC:
> %s\n", ring->name,
> + dev_dbg(adev->dev, "The ring %s pipe resets to MEC FW start PC:
> %s\n", ring->name,
> r == 0 ? "successfully" : "failed");
> /*FIXME:Sometimes driver can't cache the MEC firmware start PC
> correctly, so the pipe
> * reset status relies on the compute ring test result.
> --
> 2.49.0
>