From: Marek Olšák <marek.ol...@amd.com>

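When the draw doesn't wait for idle, prefetch only the first 2 items
(usually the API VS and VBO descriptors) before the draw and the
remaining shaders after it, so that the draw is started as soon as
possible.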
---
 src/gallium/drivers/radeonsi/si_cp_dma.c     | 68 ++++++++++++++++++----------
 src/gallium/drivers/radeonsi/si_pipe.h       |  2 +-
 src/gallium/drivers/radeonsi/si_state_draw.c | 11 ++++-
 src/util/bitscan.h                           |  8 ++++
 4 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 15bd305a350..ea2c7cf7198 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct si_context *sctx,
 static void cik_prefetch_VBO_descriptors(struct si_context *sctx)
 {
        if (!sctx->vertex_elements)
                return;
 
        cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b,
                                 sctx->vb_descriptors_offset,
                                 sctx->vertex_elements->desc_list_byte_size);
 }
 
-void cik_emit_prefetch_L2(struct si_context *sctx)
+/**
+ * Prefetch shaders and VBO descriptors.
+ *
+ * \param first_two  Whether only the first 2 items should be prefetched,
+ *                   which are usually the API VS and VBO descriptors.
+ */
+void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two)
 {
+       unsigned mask;
+
+       assert(sctx->prefetch_L2_mask);
+
+       if (first_two) {
+               mask = 1 << u_bit_scan16(&sctx->prefetch_L2_mask);
+
+               if (sctx->prefetch_L2_mask)
+                       mask |= 1 << u_bit_scan16(&sctx->prefetch_L2_mask);
+       } else {
+               mask = sctx->prefetch_L2_mask;
+               sctx->prefetch_L2_mask = 0;
+       }
+
        /* Prefetch shaders and VBO descriptors to TC L2. */
        if (sctx->b.chip_class >= GFX9) {
                /* Choose the right spot for the VBO prefetch. */
                if (sctx->tes_shader.cso) {
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
+                       if (mask & SI_PREFETCH_HS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
+                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
                                cik_prefetch_VBO_descriptors(sctx);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
+                       if (mask & SI_PREFETCH_GS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
+                       if (mask & SI_PREFETCH_VS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
                } else if (sctx->gs_shader.cso) {
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
+                       if (mask & SI_PREFETCH_GS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
+                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
                                cik_prefetch_VBO_descriptors(sctx);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
+                       if (mask & SI_PREFETCH_VS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
                } else {
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
+                       if (mask & SI_PREFETCH_VS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
+                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
                                cik_prefetch_VBO_descriptors(sctx);
                }
        } else {
                /* SI-CI-VI */
                /* Choose the right spot for the VBO prefetch. */
                if (sctx->tes_shader.cso) {
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
+                       if (mask & SI_PREFETCH_LS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
+                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
                                cik_prefetch_VBO_descriptors(sctx);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
+                       if (mask & SI_PREFETCH_HS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
+                       if (mask & SI_PREFETCH_ES)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
+                       if (mask & SI_PREFETCH_GS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
+                       if (mask & SI_PREFETCH_VS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
                } else if (sctx->gs_shader.cso) {
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
+                       if (mask & SI_PREFETCH_ES)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
+                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
                                cik_prefetch_VBO_descriptors(sctx);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
+                       if (mask & SI_PREFETCH_GS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
+                       if (mask & SI_PREFETCH_VS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
                } else {
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
+                       if (mask & SI_PREFETCH_VS)
                                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
-                       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS)
+                       if (mask & SI_PREFETCH_VBO_DESCRIPTORS)
                                cik_prefetch_VBO_descriptors(sctx);
                }
        }
 
-       if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
+       if (mask & SI_PREFETCH_PS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
-
-       sctx->prefetch_L2_mask = 0;
 }
 
 void si_init_cp_dma_functions(struct si_context *sctx)
 {
        sctx->b.b.clear_buffer = si_pipe_clear_buffer;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index bb1aebdda42..62641fde5e3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -688,21 +688,21 @@ enum r600_coherency {
 
 void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                     uint64_t offset, uint64_t size, unsigned value,
                     enum r600_coherency coher);
 void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size,
                    unsigned user_flags);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
                              uint64_t offset, unsigned size);
-void cik_emit_prefetch_L2(struct si_context *sctx);
+void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two);
 void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
 void si_auto_log_cs(void *data, struct u_log_context *log);
 void si_log_hw_flush(struct si_context *sctx);
 void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
 void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
 void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct r600_common_context *ctx,
                        struct radeon_saved_cs *saved, enum ring_type ring);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 1e79ccca054..8446b1b50bc 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1450,36 +1450,43 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                        sctx->b.render_cond_atom.emit(&sctx->b, NULL);
                sctx->dirty_atoms = 0;
 
                si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
                /* <-- CUs are busy here. */
 
                /* Start prefetches after the draw has been started. Both will run
                 * in parallel, but starting the draw first is more important.
                 */
                if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
-                       cik_emit_prefetch_L2(sctx);
+                       cik_emit_prefetch_L2(sctx, false);
        } else {
                /* If we don't wait for idle, start prefetches first, then set
                 * states, and draw at the end.
                 */
                if (sctx->b.flags)
                        si_emit_cache_flush(sctx);
 
+               /* Only prefetch the first 2 items, e.g. the API VS and VBO
+                * descriptors. */
                if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
-                       cik_emit_prefetch_L2(sctx);
+                       cik_emit_prefetch_L2(sctx, true);
 
                if (!si_upload_graphics_shader_descriptors(sctx))
                        return;
 
                si_emit_all_states(sctx, info, 0);
                si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
+
+               /* Prefetch the remaining shaders after the draw has been
+                * started. */
+               if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
+                       cik_emit_prefetch_L2(sctx, false);
        }
 
        if (unlikely(sctx->current_saved_cs)) {
                si_trace_emit(sctx);
                si_log_draw_state(sctx, sctx->b.log);
        }
 
        /* Workaround for a VGT hang when streamout is enabled.
         * It must be done after drawing. */
        if ((sctx->b.family == CHIP_HAWAII ||
diff --git a/src/util/bitscan.h b/src/util/bitscan.h
index 5cc75f0beba..78ff8e0cea1 100644
--- a/src/util/bitscan.h
+++ b/src/util/bitscan.h
@@ -89,20 +89,28 @@ ffsll(long long int val);
 
 
 /* Destructively loop over all of the bits in a mask as in:
  *
  * while (mymask) {
  *   int i = u_bit_scan(&mymask);
  *   ... process element i
  * }
  *
  */
+static inline int
+u_bit_scan16(uint16_t *mask)
+{
+   const int i = ffs(*mask) - 1;
+   *mask ^= (1u << i);
+   return i;
+}
+
 static inline int
 u_bit_scan(unsigned *mask)
 {
    const int i = ffs(*mask) - 1;
    *mask ^= (1u << i);
    return i;
 }
 
 static inline int
 u_bit_scan64(uint64_t *mask)
-- 
2.15.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to