From: Marek Olšák <marek.ol...@amd.com>

Now draw calls from multiple IBs can be executed in parallel.

v2: do emit partial flushes on SI
v3: invalidate all shader caches at the beginning of IBs

If we artificially limit the number of draw calls per IB to 5, we'll get
a lot more IBs, leading to a lot more partial flushes. Let's see how
the removal of partial flushes changes GPU utilization in that scenario:

With partial flushes (time busy):
    CP: 99%
    SPI: 86%
    CB: 73%

Without partial flushes (time busy):
    CP: 99%
    SPI: 93%
    CB: 81%
---
 src/gallium/drivers/radeonsi/si_hw_context.c | 39 ++++++++++++++++++----------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
b/src/gallium/drivers/radeonsi/si_hw_context.c
index 61c8d7067a1..b32b841a628 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -99,27 +99,31 @@ void si_context_gfx_flush(void *context, unsigned flags,
 
        if (!LIST_IS_EMPTY(&ctx->b.active_queries))
                si_suspend_queries(&ctx->b);
 
        ctx->streamout.suspended = false;
        if (ctx->streamout.begin_emitted) {
                si_emit_streamout_end(ctx);
                ctx->streamout.suspended = true;
        }
 
-       ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                       SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-       /* DRM 3.1.0 doesn't flush TC for VI correctly. */
-       if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1)
-               ctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2 |
-                               SI_CONTEXT_INV_VMEM_L1;
+       if (ctx->b.chip_class == VI && ctx->b.screen->info.drm_minor <= 1) {
+               /* DRM 3.1.0 doesn't flush TC for VI correctly. */
+               ctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                               SI_CONTEXT_CS_PARTIAL_FLUSH |
+                               SI_CONTEXT_INV_GLOBAL_L2;
+       } else if (ctx->b.chip_class == SI) {
+               /* The kernel doesn't wait for idle before flushing and
+                * invalidating TC L2. */
+               ctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                               SI_CONTEXT_CS_PARTIAL_FLUSH;
+       }
 
        si_emit_cache_flush(ctx);
 
        if (ctx->current_saved_cs) {
                si_trace_emit(ctx);
                si_log_hw_flush(ctx);
 
                /* Save the IB for debug contexts. */
                si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
                ctx->current_saved_cs->flushed = true;
@@ -180,26 +184,35 @@ static void si_begin_cs_debug(struct si_context *ctx)
 
        radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, 
ctx->current_saved_cs->trace_buf,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 }
 
 void si_begin_new_cs(struct si_context *ctx)
 {
        if (ctx->is_debug)
                si_begin_cs_debug(ctx);
 
-       /* Flush read caches at the beginning of CS not flushed by the kernel. 
*/
-       if (ctx->b.chip_class >= CIK)
-               ctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
-                               SI_CONTEXT_INV_ICACHE;
-
-       ctx->b.flags |= SI_CONTEXT_START_PIPELINE_STATS;
+       /* Always invalidate caches at the beginning of IBs, because external
+        * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+        * buffers.
+        *
+        * Note that the cache flush done by the kernel at the end of GFX IBs
+        * isn't useful here, because that flush can finish after the following
+        * IB starts drawing.
+        *
+        * TODO: Do we also need to invalidate CB & DB caches?
+        */
+       ctx->b.flags |= SI_CONTEXT_INV_ICACHE |
+                       SI_CONTEXT_INV_SMEM_L1 |
+                       SI_CONTEXT_INV_VMEM_L1 |
+                       SI_CONTEXT_INV_GLOBAL_L2 |
+                       SI_CONTEXT_START_PIPELINE_STATS;
 
        /* set all valid group as dirty so they get reemited on
         * next draw command
         */
        si_pm4_reset_emitted(ctx);
 
        /* The CS initialization should be emitted before everything else. */
        si_pm4_emit(ctx, ctx->init_config);
        if (ctx->init_config_gs_rings)
                si_pm4_emit(ctx, ctx->init_config_gs_rings);
-- 
2.15.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to