Re: [Mesa-dev] [PATCH 15/17] radeonsi: always prefetch later shaders after the draw packet
Reviewed-by: Samuel Pitoiset

On 04/05/2018 02:33 AM, Marek Olšák wrote: From: Marek Olšák so that the draw is started as soon as possible. v2: only prefetch the API VS and VBO descriptors --- src/gallium/drivers/radeonsi/si_cp_dma.c | 89 +--- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state_draw.c | 10 +++- 3 files changed, 75 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 15bd305a350..1e26774ffee 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -514,80 +514,123 @@ static void cik_prefetch_shader_async(struct si_context *sctx, static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { if (!sctx->vertex_elements) return; cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, sctx->vertex_elements->desc_list_byte_size); } -void cik_emit_prefetch_L2(struct si_context *sctx) +/** + * Prefetch shaders and VBO descriptors. + * + * \param vertex_stage_only Whether only the the API VS and VBO descriptors + * should be prefetched. + */ +void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only) { + unsigned mask = sctx->prefetch_L2_mask; + assert(mask); + /* Prefetch shaders and VBO descriptors to TC L2. */ if (sctx->b.chip_class >= GFX9) { /* Choose the right spot for the VBO prefetch. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } } } else { /* SI-CI-VI */ /* Choose the right spot for the VBO prefetch. */ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) + if (mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); -
[Mesa-dev] [PATCH 15/17] radeonsi: always prefetch later shaders after the draw packet
From: Marek Olšák

so that the draw is started as soon as possible. v2: only prefetch the API VS and VBO descriptors --- src/gallium/drivers/radeonsi/si_cp_dma.c | 89 +--- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state_draw.c | 10 +++- 3 files changed, 75 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 15bd305a350..1e26774ffee 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -514,80 +514,123 @@ static void cik_prefetch_shader_async(struct si_context *sctx, static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { if (!sctx->vertex_elements) return; cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, sctx->vertex_elements->desc_list_byte_size); } -void cik_emit_prefetch_L2(struct si_context *sctx) +/** + * Prefetch shaders and VBO descriptors. + * + * \param vertex_stage_only Whether only the the API VS and VBO descriptors + * should be prefetched. + */ +void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only) { + unsigned mask = sctx->prefetch_L2_mask; + assert(mask); + /* Prefetch shaders and VBO descriptors to TC L2. */ if (sctx->b.chip_class >= GFX9) { /* Choose the right spot for the VBO prefetch. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } } } else { /* SI-CI-VI */ /* Choose the right spot for the VBO prefetch. */ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) + if (mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask &
Re: [Mesa-dev] [PATCH 15/17] radeonsi: always prefetch later shaders after the draw packet
On Wed, Apr 4, 2018, 6:07 AM Samuel Pitoisetwrote: > > > On 04/04/2018 03:59 AM, Marek Olšák wrote: > > From: Marek Olšák > > > > so that the draw is started as soon as possible. > > --- > > src/gallium/drivers/radeonsi/si_cp_dma.c | 68 > ++-- > > src/gallium/drivers/radeonsi/si_pipe.h | 2 +- > > src/gallium/drivers/radeonsi/si_state_draw.c | 11 - > > src/util/bitscan.h | 8 > > 4 files changed, 61 insertions(+), 28 deletions(-) > > > > diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c > b/src/gallium/drivers/radeonsi/si_cp_dma.c > > index 15bd305a350..ea2c7cf7198 100644 > > --- a/src/gallium/drivers/radeonsi/si_cp_dma.c > > +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c > > @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct > si_context *sctx, > > static void cik_prefetch_VBO_descriptors(struct si_context *sctx) > > { > > if (!sctx->vertex_elements) > > return; > > > > cik_prefetch_TC_L2_async(sctx, >vb_descriptors_buffer->b.b, > >sctx->vb_descriptors_offset, > > > sctx->vertex_elements->desc_list_byte_size); > > } > > > > -void cik_emit_prefetch_L2(struct si_context *sctx) > > +/** > > + * Prefetch shaders and VBO descriptors. > > + * > > + * \param first_two Whether only the first 2 items should be > prefetched, > > + * which are usually the API VS and VBO descriptors. > > + */ > > +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two) > > { > > + unsigned mask; > > + > > + assert(sctx->prefetch_L2_mask); > > + > > + if (first_two) { > > + mask = 1 << u_bit_scan16(>prefetch_L2_mask); > > + > > + if (sctx->prefetch_L2_mask) > > + mask |= 1 << u_bit_scan16(>prefetch_L2_mask); > > Where do you reset the prefetch L2 mask ? It looks like to me that you > are going to prefetch VS/VBOs twice in the fast draw path. > u_bit_scan16 clears the returned bit. Marek > + } else { > > + mask = sctx->prefetch_L2_mask; > > + sctx->prefetch_L2_mask = 0; > > + } > > + > > /* Prefetch shaders and VBO descriptors to TC L2. 
*/ > > if (sctx->b.chip_class >= GFX9) { > > /* Choose the right spot for the VBO prefetch. */ > > if (sctx->tes_shader.cso) { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) > > + if (mask & SI_PREFETCH_HS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.hs); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) > > + if (mask & SI_PREFETCH_GS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.gs); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > } else if (sctx->gs_shader.cso) { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) > > + if (mask & SI_PREFETCH_GS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.gs); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > } else { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) > > + if (mask & SI_PREFETCH_VS) > > cik_prefetch_shader_async(sctx, > sctx->queued.named.vs); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) > > cik_prefetch_VBO_descriptors(sctx); > > } > > } else { > > /* SI-CI-VI */ > > /* Choose the right spot for the VBO prefetch. */ > > if (sctx->tes_shader.cso) { > > - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) > > + if (mask & SI_PREFETCH_LS) > > cik_prefetch_shader_async(sctx, sctx-> > queued.named.ls); > > - if (sctx->prefetch_L2_mask & > SI_PREFETCH_VBO_DESCRIPTORS) > > +
Re: [Mesa-dev] [PATCH 15/17] radeonsi: always prefetch later shaders after the draw packet
On 04/04/2018 03:59 AM, Marek Olšák wrote: From: Marek Olšákso that the draw is started as soon as possible. --- src/gallium/drivers/radeonsi/si_cp_dma.c | 68 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state_draw.c | 11 - src/util/bitscan.h | 8 4 files changed, 61 insertions(+), 28 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 15bd305a350..ea2c7cf7198 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct si_context *sctx, static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { if (!sctx->vertex_elements) return; cik_prefetch_TC_L2_async(sctx, >vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, sctx->vertex_elements->desc_list_byte_size); } -void cik_emit_prefetch_L2(struct si_context *sctx) +/** + * Prefetch shaders and VBO descriptors. + * + * \param first_two Whether only the first 2 items should be prefetched, + * which are usually the API VS and VBO descriptors. + */ +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two) { + unsigned mask; + + assert(sctx->prefetch_L2_mask); + + if (first_two) { + mask = 1 << u_bit_scan16(>prefetch_L2_mask); + + if (sctx->prefetch_L2_mask) + mask |= 1 << u_bit_scan16(>prefetch_L2_mask); Where do you reset the prefetch L2 mask ? It looks like to me that you are going to prefetch VS/VBOs twice in the fast draw path. + } else { + mask = sctx->prefetch_L2_mask; + sctx->prefetch_L2_mask = 0; + } + /* Prefetch shaders and VBO descriptors to TC L2. */ if (sctx->b.chip_class >= GFX9) { /* Choose the right spot for the VBO prefetch. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); } } else { /* SI-CI-VI */ /* Choose the right spot for the VBO prefetch. */ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) + if (mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if
[Mesa-dev] [PATCH 15/17] radeonsi: always prefetch later shaders after the draw packet
From: Marek Olšák

so that the draw is started as soon as possible. --- src/gallium/drivers/radeonsi/si_cp_dma.c | 68 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state_draw.c | 11 - src/util/bitscan.h | 8 4 files changed, 61 insertions(+), 28 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 15bd305a350..ea2c7cf7198 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct si_context *sctx, static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { if (!sctx->vertex_elements) return; cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, sctx->vertex_elements->desc_list_byte_size); } -void cik_emit_prefetch_L2(struct si_context *sctx) +/** + * Prefetch shaders and VBO descriptors. + * + * \param first_two Whether only the first 2 items should be prefetched, + * which are usually the API VS and VBO descriptors. + */ +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two) { + unsigned mask; + + assert(sctx->prefetch_L2_mask); + + if (first_two) { + mask = 1 << u_bit_scan16(&sctx->prefetch_L2_mask); + + if (sctx->prefetch_L2_mask) + mask |= 1 << u_bit_scan16(&sctx->prefetch_L2_mask); + } else { + mask = sctx->prefetch_L2_mask; + sctx->prefetch_L2_mask = 0; + } + /* Prefetch shaders and VBO descriptors to TC L2. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); } } else { /* SI-CI-VI */ /* Choose the right spot for the VBO prefetch. */ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) + if (mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) + if (mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); -