On Thursday, December 18, 2025 at 16:41:40 US Central Time, Alex Deucher wrote:
> Replace WAIT_REG_MEM with EVENT_WRITE flushes for all
> shader types and PFP_SYNC_ME. That should accomplish
> the same thing and avoid having to wait on a fence
> preventing any issues with pipeline syncs during
> queue resets.
>
> Signed-off-by: Alex Deucher <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 32 ++++++++++++++++++---------
> 1 file changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 7b012ca1153ea..d9dee3c11a05d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5572,15 +5572,26 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>  	amdgpu_ring_write(ring, 0);
> }
>
> -static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> +static void gfx_v9_0_ring_emit_event_write(struct amdgpu_ring *ring,
> + uint32_t event_type,
> +					    uint32_t event_index)
> {
> - int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> - uint32_t seq = ring->fence_drv.sync_seq;
> - uint64_t addr = ring->fence_drv.gpu_addr;
> + amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
> + amdgpu_ring_write(ring, EVENT_TYPE(event_type) |
> + EVENT_INDEX(event_index));
> +}
>
> - gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
> -			      lower_32_bits(addr), upper_32_bits(addr),
> - seq, 0xffffffff, 4);
> +static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> +{
> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX) {
> +		gfx_v9_0_ring_emit_event_write(ring, VS_PARTIAL_FLUSH, 4);
Is VS_PARTIAL_FLUSH necessary when we already have PS_PARTIAL_FLUSH?
When we wait for all PS to finish, wouldn't that imply that all VS had already
finished as well?
> +		gfx_v9_0_ring_emit_event_write(ring, PS_PARTIAL_FLUSH, 4);
> +		gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH, 4);
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
> + amdgpu_ring_write(ring, 0x0);
The above sequence just waits for all shaders to finish, but as far as I
understand, it doesn't wait for memory writes or cache flushes. Please correct
me if I'm wrong about this. For that, I think we do need an ACQUIRE_MEM
packet. (And if the ACQUIRE_MEM is done on the PFP, then we won't need the
PFP_SYNC_ME.)
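
For concreteness, here is a rough, untested sketch of what such an
ACQUIRE_MEM emission could look like, modeled on the existing
gfx_v9_0_emit_mem_sync() in this file. The helper name, the exact set of
coherency bits, and whether the packet could be issued on the PFP here are
all assumptions on my side and would need to be checked against the PM4
documentation:

/* Hypothetical sketch, not tested: flush/invalidate the relevant caches
 * as part of the pipeline sync, mirroring gfx_v9_0_emit_mem_sync().
 */
static void gfx_v9_0_ring_emit_acquire_mem(struct amdgpu_ring *ring)
{
	const unsigned int cp_coher_cntl =
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(1) |
		PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(1);

	/* ACQUIRE_MEM: wait for outstanding surface writes and make the
	 * selected caches coherent before subsequent operations.
	 */
	amdgpu_ring_write(ring, PACKET3(PACKET3_ACQUIRE_MEM, 5));
	amdgpu_ring_write(ring, cp_coher_cntl);	/* CP_COHER_CNTL */
	amdgpu_ring_write(ring, 0xffffffff);	/* CP_COHER_SIZE */
	amdgpu_ring_write(ring, 0xffffff);	/* CP_COHER_SIZE_HI */
	amdgpu_ring_write(ring, 0);		/* CP_COHER_BASE */
	amdgpu_ring_write(ring, 0);		/* CP_COHER_BASE_HI */
	amdgpu_ring_write(ring, 0x0000000A);	/* POLL_INTERVAL */
}

Whatever packet sequence you end up with, the emit_frame_size accounting
below would of course have to match it.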
> + } else {
> +		gfx_v9_0_ring_emit_event_write(ring, CS_PARTIAL_FLUSH, 4);
> + }
> }
>
> static void gfx_v9_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
> @@ -7404,7 +7415,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  	.set_wptr = gfx_v9_0_ring_set_wptr_gfx,
> .emit_frame_size = /* totally 242 maximum if 16 IBs */
> 5 + /* COND_EXEC */
> - 7 + /* PIPELINE_SYNC */
> + 8 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 2 + /* VM_FLUSH */
> @@ -7460,7 +7471,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>  	.set_wptr = amdgpu_sw_ring_set_wptr_gfx,
> .emit_frame_size = /* totally 242 maximum if 16 IBs */
> 5 + /* COND_EXEC */
> - 7 + /* PIPELINE_SYNC */
> + 8 + /* PIPELINE_SYNC */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> 2 + /* VM_FLUSH */
> @@ -7521,7 +7532,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  		20 + /* gfx_v9_0_ring_emit_gds_switch */
> 7 + /* gfx_v9_0_ring_emit_hdp_flush */
> 5 + /* hdp invalidate */
> - 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> + 2 + /* gfx_v9_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> @@ -7564,7 +7575,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>  		20 + /* gfx_v9_0_ring_emit_gds_switch */
> 7 + /* gfx_v9_0_ring_emit_hdp_flush */
> 5 + /* hdp invalidate */
> - 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */