From: Marek Olšák <marek.ol...@amd.com> Subject: [PATCH] radeonsi: prefetch only the first 2 items (the API VS and VBO descriptors) before the draw, so that the draw is started as soon as possible. --- src/gallium/drivers/radeonsi/si_cp_dma.c | 68 ++++++++++++++++++---------- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state_draw.c | 11 ++++- src/util/bitscan.h | 8 ++++ 4 files changed, 61 insertions(+), 28 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 15bd305a350..ea2c7cf7198 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -514,80 +514,98 @@ static void cik_prefetch_shader_async(struct si_context *sctx, static void cik_prefetch_VBO_descriptors(struct si_context *sctx) { if (!sctx->vertex_elements) return; cik_prefetch_TC_L2_async(sctx, &sctx->vb_descriptors_buffer->b.b, sctx->vb_descriptors_offset, sctx->vertex_elements->desc_list_byte_size); } -void cik_emit_prefetch_L2(struct si_context *sctx) +/** + * Prefetch shaders and VBO descriptors. + * + * \param first_two Whether only the first 2 items should be prefetched, + * which are usually the API VS and VBO descriptors. + */ +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two) { + unsigned mask; + + assert(sctx->prefetch_L2_mask); + + if (first_two) { + mask = 1 << u_bit_scan16(&sctx->prefetch_L2_mask); + + if (sctx->prefetch_L2_mask) + mask |= 1 << u_bit_scan16(&sctx->prefetch_L2_mask); + } else { + mask = sctx->prefetch_L2_mask; + sctx->prefetch_L2_mask = 0; + } + /* Prefetch shaders and VBO descriptors to TC L2. */ if (sctx->b.chip_class >= GFX9) { /* Choose the right spot for the VBO prefetch. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); } } else { /* SI-CI-VI */ /* Choose the right spot for the VBO prefetch. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) + if (mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) + if (mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) + if (mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); } } - if (sctx->prefetch_L2_mask & SI_PREFETCH_PS) + if (mask & SI_PREFETCH_PS) cik_prefetch_shader_async(sctx, sctx->queued.named.ps); - - sctx->prefetch_L2_mask = 0; } void si_init_cp_dma_functions(struct si_context *sctx) { sctx->b.b.clear_buffer = si_pipe_clear_buffer; } diff --git 
a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index bb1aebdda42..62641fde5e3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -688,21 +688,21 @@ enum r600_coherency { void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, enum r600_coherency coher); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned user_flags); void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size); -void cik_emit_prefetch_L2(struct si_context *sctx); +void cik_emit_prefetch_L2(struct si_context *sctx, bool first_two); void si_init_cp_dma_functions(struct si_context *sctx); /* si_debug.c */ void si_auto_log_cs(void *data, struct u_log_context *log); void si_log_hw_flush(struct si_context *sctx); void si_log_draw_state(struct si_context *sctx, struct u_log_context *log); void si_log_compute_state(struct si_context *sctx, struct u_log_context *log); void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct r600_common_context *ctx, struct radeon_saved_cs *saved, enum ring_type ring); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 1e79ccca054..8446b1b50bc 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1450,36 +1450,43 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) sctx->b.render_cond_atom.emit(&sctx->b, NULL); sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); /* <-- CUs are busy here. */ /* Start prefetches after the draw has been started. Both will run * in parallel, but starting the draw first is more important. 
*/ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx); + cik_emit_prefetch_L2(sctx, false); } else { /* If we don't wait for idle, start prefetches first, then set * states, and draw at the end. */ if (sctx->b.flags) si_emit_cache_flush(sctx); + /* Only prefetch the first 2 items, e.g. the API VS and VBO + * descriptors. */ if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx); + cik_emit_prefetch_L2(sctx, true); if (!si_upload_graphics_shader_descriptors(sctx)) return; si_emit_all_states(sctx, info, 0); si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + + /* Prefetch the remaining shaders after the draw has been + * started. */ + if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, false); } if (unlikely(sctx->current_saved_cs)) { si_trace_emit(sctx); si_log_draw_state(sctx, sctx->b.log); } /* Workaround for a VGT hang when streamout is enabled. * It must be done after drawing. */ if ((sctx->b.family == CHIP_HAWAII || diff --git a/src/util/bitscan.h b/src/util/bitscan.h index 5cc75f0beba..78ff8e0cea1 100644 --- a/src/util/bitscan.h +++ b/src/util/bitscan.h @@ -89,20 +89,28 @@ ffsll(long long int val); /* Destructively loop over all of the bits in a mask as in: * * while (mymask) { * int i = u_bit_scan(&mymask); * ... process element i * } * */ +static inline int +u_bit_scan16(uint16_t *mask) +{ + const int i = ffs(*mask) - 1; + *mask ^= (1u << i); + return i; +} + static inline int u_bit_scan(unsigned *mask) { const int i = ffs(*mask) - 1; *mask ^= (1u << i); return i; } static inline int u_bit_scan64(uint64_t *mask) -- 2.15.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev