From: Marek Olšák <marek.ol...@amd.com>

Now draw calls from multiple IBs can be executed in parallel.

v2: do emit partial flushes on SI
v3: invalidate all shader caches at the beginning of IBs
v4: squash with the AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE change,
    don't call si_emit_cache_flush in si_flush_gfx_cs if not needed,
    only do this for flushes invoked internally
v5: empty IBs should wait for idle if the flush requires it

If we artificially limit the number of draw calls per IB to 5, we'll get
a lot more IBs, leading to a lot more partial flushes. Let's see how
the removal of partial flushes changes GPU utilization in that scenario:

With partial flushes (time busy):
    CP: 99%
    SPI: 86%
    CB: 73%

Without partial flushes (time busy):
    CP: 99%
    SPI: 93%
    CB: 81%
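
(How the 5-draw limit was imposed isn't part of this patch. One way to
reproduce the measurement is a hack in the draw path, sketched below with an
invented per-IB counter; both rows come from the same hack and differ only in
whether the end-of-IB partial flushes are emitted.)

    /* Hypothetical instrumentation only, not in this patch: flush after
     * every 5th draw call to multiply the number of IBs submitted. */
    if (++sctx->num_draws_this_ib >= 5) {
            sctx->num_draws_this_ib = 0;
            si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
    }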
---
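A note for readers skimming the diff: the new end-of-IB policy in
si_flush_gfx_cs can be summarized as a small decision function. The sketch
below is an illustrative restatement, not code from this patch; it assumes
the radeonsi context type and the SI_CONTEXT_* flags from si_pipe.h.

    /* Illustration only: what si_flush_gfx_cs now waits for at the end
     * of a gfx IB, given the caller's flush flags. */
    static unsigned end_of_ib_wait_flags(struct si_context *ctx, unsigned flags)
    {
            /* DRM 3.1.0 doesn't flush TC for VI correctly: wait for shaders
             * and invalidate L2 ourselves. */
            if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1)
                    return SI_CONTEXT_PS_PARTIAL_FLUSH |
                           SI_CONTEXT_CS_PARTIAL_FLUSH |
                           SI_CONTEXT_INV_GLOBAL_L2;

            /* On SI, the kernel flushes L2 before shaders are finished,
             * so keep waiting for shaders. */
            if (ctx->chip_class == SI)
                    return SI_CONTEXT_PS_PARTIAL_FLUSH |
                           SI_CONTEXT_CS_PARTIAL_FLUSH;

            /* Flushes that don't set the new flag keep the old behavior. */
            if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW))
                    return SI_CONTEXT_PS_PARTIAL_FLUSH |
                           SI_CONTEXT_CS_PARTIAL_FLUSH;

            /* Internally invoked flushes: no end-of-IB wait. This is safe
             * because the next IB invalidates all shader caches at its start
             * (see si_begin_new_gfx_cs), so its draws may overlap with ours. */
            return 0;
    }

Only the last case lets draws from consecutive IBs overlap; in the other
cases the IB still ends with a partial flush. A flush of an empty IB is still
submitted when the wait flags are non-zero and the previous IB may still be
busy (the v5 change above).
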
 src/gallium/drivers/radeon/radeon_winsys.h      |  7 ++++
 src/gallium/drivers/radeonsi/si_buffer.c        |  6 +--
 src/gallium/drivers/radeonsi/si_dma_cs.c        |  2 +-
 src/gallium/drivers/radeonsi/si_fence.c         |  5 ++-
 src/gallium/drivers/radeonsi/si_gfx_cs.c        | 56 ++++++++++++++++++-------
 src/gallium/drivers/radeonsi/si_pipe.h          |  3 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  4 +-
 src/gallium/drivers/radeonsi/si_texture.c       |  2 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 12 ++++--
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c       | 36 +++++++++++-----
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 12 ++++--
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c   |  3 +-
 12 files changed, 104 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 157b2e40550..fae4fb7a95d 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -21,20 +21,27 @@
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
 #ifndef RADEON_WINSYS_H
 #define RADEON_WINSYS_H
 
 /* The public winsys interface header for the radeon driver. */
 
+/* Whether the next IB can start immediately and not wait for draws and
+ * dispatches from the current IB to finish. */
+#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW     (1u << 31)
+
+#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \
+       (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW)
+
 #include "pipebuffer/pb_buffer.h"
 
 #include "amd/common/ac_gpu_info.h"
 #include "amd/common/ac_surface.h"
 
 /* Tiling flags. */
 enum radeon_bo_layout {
     RADEON_LAYOUT_LINEAR = 0,
     RADEON_LAYOUT_TILED,
     RADEON_LAYOUT_SQUARETILED,
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 1420702d8d4..d17b2c6a831 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -57,24 +57,24 @@ void *si_buffer_map_sync_with_rings(struct si_context *sctx,
 
        if (!(usage & PIPE_TRANSFER_WRITE)) {
                /* have to wait for the last write */
                rusage = RADEON_USAGE_WRITE;
        }
 
        if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
            sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
                                                resource->buf, rusage)) {
                if (usage & PIPE_TRANSFER_DONTBLOCK) {
-                       si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+                       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                        return NULL;
                } else {
-                       si_flush_gfx_cs(sctx, 0, NULL);
+                       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                        busy = true;
                }
        }
        if (radeon_emitted(sctx->dma_cs, 0) &&
            sctx->ws->cs_is_buffer_referenced(sctx->dma_cs,
                                                resource->buf, rusage)) {
                if (usage & PIPE_TRANSFER_DONTBLOCK) {
                        si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
                        return NULL;
                } else {
@@ -718,21 +718,21 @@ static bool si_resource_commit(struct pipe_context *pctx,
        /*
         * Since buffer commitment changes cannot be pipelined, we need to
         * (a) flush any pending commands that refer to the buffer we're about
         *     to change, and
         * (b) wait for threaded submit to finish, including those that were
         *     triggered by some other, earlier operation.
         */
        if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
            ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
                                              res->buf, RADEON_USAGE_READWRITE)) {
-               si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
        }
        if (radeon_emitted(ctx->dma_cs, 0) &&
            ctx->ws->cs_is_buffer_referenced(ctx->dma_cs,
                                              res->buf, RADEON_USAGE_READWRITE)) {
                si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
        }
 
        ctx->ws->cs_sync_flush(ctx->dma_cs);
        ctx->ws->cs_sync_flush(ctx->gfx_cs);
 
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 7af7c5623b7..1eefaeb6ad5 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -51,21 +51,21 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
        }
 
        /* Flush the GFX IB if DMA depends on it. */
        if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
            ((dst &&
              ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
                                                 RADEON_USAGE_READWRITE)) ||
             (src &&
              ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
                                                 RADEON_USAGE_WRITE))))
-               si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
        /* Flush if there's not enough space, or if the memory usage per IB
         * is too large.
         *
         * IBs using too little memory are limited by the IB submission overhead.
         * IBs using too much memory are limited by the kernel/TTM overhead.
         * Too long IBs create CPU-GPU pipeline bubbles and add latency.
         *
         * This heuristic makes sure that DMA requests are executed
         * very soon after the call is made and lowers memory usage.
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index 26d6c43b34d..19fcb96041f 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -367,21 +367,24 @@ static boolean si_fence_finish(struct pipe_screen *screen,
                         *     * and the calls to ClientWaitSync and FenceSync were
                         *       issued from the same context,
                         *
                         *     then the GL will behave as if the equivalent of Flush
                         *     were inserted immediately after the creation of sync."
                         *
                         * This means we need to flush for such fences even when we're
                         * not going to wait.
                         */
                        threaded_context_unwrap_sync(ctx);
-                       si_flush_gfx_cs(sctx, timeout ? 0 : PIPE_FLUSH_ASYNC, NULL);
+                       si_flush_gfx_cs(sctx,
+                                       (timeout ? 0 : PIPE_FLUSH_ASYNC) |
+                                        RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+                                       NULL);
                        rfence->gfx_unflushed.ctx = NULL;
 
                        if (!timeout)
                                return false;
 
                        /* Recompute the timeout after all that. */
                        if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
                                int64_t time = os_time_get_nano();
                                timeout = abs_timeout > time ? abs_timeout - time : 0;
                        }
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 2d5e510b19e..0173c64631b 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -40,47 +40,64 @@ void si_need_gfx_cs_space(struct si_context *ctx)
         */
 
        /* There are two memory usage counters in the winsys for all buffers
         * that have been added (cs_add_buffer) and two counters in the pipe
         * driver for those that haven't been added yet.
         */
        if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
                                                   ctx->vram, ctx->gtt))) {
                ctx->gtt = 0;
                ctx->vram = 0;
-               si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                return;
        }
        ctx->gtt = 0;
        ctx->vram = 0;
 
        /* If the IB is sufficiently large, don't count the space needed
         * and just flush if there is not enough space left.
         *
         * Also reserve space for stopping queries at the end of IB, because
         * the number of active queries is mostly unlimited.
         */
        unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
        if (!ctx->ws->cs_check_space(cs, need_dwords))
-               si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+               si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
                     struct pipe_fence_handle **fence)
 {
        struct radeon_winsys_cs *cs = ctx->gfx_cs;
        struct radeon_winsys *ws = ctx->ws;
+       unsigned wait_flags = 0;
 
        if (ctx->gfx_flush_in_progress)
                return;
 
-       if (!radeon_emitted(cs, ctx->initial_gfx_cs_size))
+       if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) {
+               /* DRM 3.1.0 doesn't flush TC for VI correctly. */
+               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                             SI_CONTEXT_CS_PARTIAL_FLUSH |
+                             SI_CONTEXT_INV_GLOBAL_L2;
+       } else if (ctx->chip_class == SI) {
+               /* The kernel flushes L2 before shaders are finished. */
+               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                             SI_CONTEXT_CS_PARTIAL_FLUSH;
+       } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+               wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                             SI_CONTEXT_CS_PARTIAL_FLUSH;
+       }
+
+       /* Drop this flush if it's a no-op. */
+       if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
+           (!wait_flags || !ctx->gfx_last_ib_is_busy))
                return;
 
        if (si_check_device_reset(ctx))
                return;
 
        if (ctx->screen->debug_flags & DBG(CHECK_VM))
                flags &= ~PIPE_FLUSH_ASYNC;
 
        /* If the state tracker is flushing the GFX IB, si_flush_from_st is
         * responsible for flushing the DMA IB and merging the fences from both.
@@ -96,27 +113,25 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 
        if (!LIST_IS_EMPTY(&ctx->active_queries))
                si_suspend_queries(ctx);
 
        ctx->streamout.suspended = false;
        if (ctx->streamout.begin_emitted) {
                si_emit_streamout_end(ctx);
                ctx->streamout.suspended = true;
        }
 
-       ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                       SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-       /* DRM 3.1.0 doesn't flush TC for VI correctly. */
-       if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1)
-               ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 |
-                               SI_CONTEXT_INV_VMEM_L1;
+       if (wait_flags) {
+               ctx->flags |= wait_flags;
+               si_emit_cache_flush(ctx);
+       }
+       ctx->gfx_last_ib_is_busy = wait_flags == 0;
 
        /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
         * because the kernel doesn't wait for it. */
        if (ctx->chip_class >= CIK)
                si_cp_dma_wait_for_idle(ctx);
 
        if (ctx->current_saved_cs) {
                si_trace_emit(ctx);
                si_log_hw_flush(ctx);
 
@@ -180,26 +195,35 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)
 
        radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 }
 
 void si_begin_new_gfx_cs(struct si_context *ctx)
 {
        if (ctx->is_debug)
                si_begin_gfx_cs_debug(ctx);
 
-       /* Flush read caches at the beginning of CS not flushed by the kernel. */
-       if (ctx->chip_class >= CIK)
-               ctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
-                               SI_CONTEXT_INV_ICACHE;
-
-       ctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+       /* Always invalidate caches at the beginning of IBs, because external
+        * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+        * buffers.
+        *
+        * Note that the cache flush done by the kernel at the end of GFX IBs
+        * isn't useful here, because that flush can finish after the following
+        * IB starts drawing.
+        *
+        * TODO: Do we also need to invalidate CB & DB caches?
+        */
+       ctx->flags |= SI_CONTEXT_INV_ICACHE |
+                     SI_CONTEXT_INV_SMEM_L1 |
+                     SI_CONTEXT_INV_VMEM_L1 |
+                     SI_CONTEXT_INV_GLOBAL_L2 |
+                     SI_CONTEXT_START_PIPELINE_STATS;
 
        /* set all valid group as dirty so they get reemited on
         * next draw command
         */
        si_pm4_reset_emitted(ctx);
 
        /* The CS initialization should be emitted before everything else. */
        si_pm4_emit(ctx, ctx->init_config);
        if (ctx->init_config_gs_rings)
                si_pm4_emit(ctx, ctx->init_config_gs_rings);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 0c90a6c6e46..5e24d6cbb7e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -540,20 +540,21 @@ struct si_context {
        void                            *vs_blit_texcoord;
        struct si_screen                *screen;
        struct pipe_debug_callback      debug;
        LLVMTargetMachineRef            tm; /* only non-threaded compilation */
        struct si_shader_ctx_state      fixed_func_tcs_shader;
        struct r600_resource            *wait_mem_scratch;
        unsigned                        wait_mem_number;
        uint16_t                        prefetch_L2_mask;
 
        bool                            gfx_flush_in_progress:1;
+       bool                            gfx_last_ib_is_busy:1;
        bool                            compute_is_busy:1;
 
        unsigned                        num_gfx_cs_flushes;
        unsigned                        initial_gfx_cs_size;
        unsigned                        gpu_reset_counter;
        unsigned                        last_dirty_tex_counter;
        unsigned                        last_compressed_colortex_counter;
        unsigned                        last_num_draw_calls;
        unsigned                        flags; /* flush flags */
        /* Current unaccounted memory usage. */
@@ -1320,19 +1321,19 @@ static inline void
 radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
                                        struct r600_resource *rbo,
                                        enum radeon_bo_usage usage,
                                        enum radeon_bo_priority priority,
                                        bool check_mem)
 {
        if (check_mem &&
            !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
                                          sctx->vram + rbo->vram_usage,
                                          sctx->gtt + rbo->gart_usage))
-               si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+               si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
        radeon_add_to_buffer_list(sctx, sctx->gfx_cs, rbo, usage, priority);
 }
 
 #define PRINT_ERR(fmt, args...) \
        fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 7e1660415f5..67ab75bbd2d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2767,21 +2767,21 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
                si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
        sctx->init_config_gs_rings = pm4;
 
        if (!sctx->init_config_has_vgt_flush) {
                si_init_config_add_vgt_flush(sctx);
                si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
        }
 
        /* Flush the context to re-emit both init_config states. */
        sctx->initial_gfx_cs_size = 0; /* force flush */
-       si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
        /* Set ring bindings. */
        if (sctx->esgs_ring) {
                assert(sctx->chip_class <= VI);
                si_set_ring_buffer(sctx, SI_ES_RING_ESGS,
                                   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
                                   true, true, 4, 64, 0);
                si_set_ring_buffer(sctx, SI_GS_RING_ESGS,
                                   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
                                   false, false, 0, 0, 0);
@@ -3044,21 +3044,21 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
                               factor_va >> 8);
                si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
                               sctx->screen->vgt_hs_offchip_param);
        }
 
        /* Flush the context to re-emit the init_config state.
         * This is done only once in a lifetime of a context.
         */
        si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
        sctx->initial_gfx_cs_size = 0; /* force flush */
-       si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+       si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
 /**
  * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
  * VS passes its outputs to TES directly, so the fixed-function shader only
  * has to write TESSOUTER and TESSINNER.
  */
 static void si_generate_fixed_func_tcs(struct si_context *sctx)
 {
        struct ureg_src outer, inner;
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index 1f0de5e71ec..8964c6b730c 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -1862,21 +1862,21 @@ static void si_texture_transfer_unmap(struct pipe_context *ctx,
         * The idea is that we don't want to build IBs that use too much
         * memory and put pressure on the kernel memory manager and we also
         * want to make temporary and invalidated buffers go idle ASAP to
         * decrease the total memory usage or make them reusable. The memory
         * usage will be slightly higher than given here because of the buffer
         * cache in the winsys.
         *
         * The result is that the kernel memory manager is never a bottleneck.
         */
        if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
-               si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+               si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                sctx->num_alloc_tex_transfer_bytes = 0;
        }
 
        pipe_resource_reference(&transfer->resource, NULL);
        FREE(transfer);
 }
 
 static const struct u_resource_vtbl si_texture_vtbl =
 {
        NULL,                           /* get_handle */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 22b5a73143d..9b6d6e83032 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -232,31 +232,33 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
          if (!(usage & PIPE_TRANSFER_WRITE)) {
             /* Mapping for read.
              *
              * Since we are mapping for read, we don't need to wait
              * if the GPU is using the buffer for read too
              * (neither one is changing it).
              *
              * Only check whether the buffer is being used for write. */
            if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                               RADEON_USAGE_WRITE)) {
-               cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+               cs->flush_cs(cs->flush_data,
+                           RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                return NULL;
             }
 
             if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
                                 RADEON_USAGE_WRITE)) {
                return NULL;
             }
          } else {
             if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
-               cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+               cs->flush_cs(cs->flush_data,
+                           RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                return NULL;
             }
 
             if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
                                 RADEON_USAGE_READWRITE)) {
                return NULL;
             }
          }
       } else {
          uint64_t time = os_time_get_nano();
@@ -265,35 +267,37 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
             /* Mapping for read.
              *
              * Since we are mapping for read, we don't need to wait
              * if the GPU is using the buffer for read too
              * (neither one is changing it).
              *
              * Only check whether the buffer is being used for write. */
             if (cs) {
               if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                            RADEON_USAGE_WRITE)) {
-                  cs->flush_cs(cs->flush_data, 0, NULL);
+                  cs->flush_cs(cs->flush_data,
+                              RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
                } else {
                   /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                   if (p_atomic_read(&bo->num_active_ioctls))
                      amdgpu_cs_sync_flush(rcs);
                }
             }
 
             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_WRITE);
          } else {
             /* Mapping for write. */
             if (cs) {
                if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
-                  cs->flush_cs(cs->flush_data, 0, NULL);
+                  cs->flush_cs(cs->flush_data,
+                              RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
                } else {
                   /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                   if (p_atomic_read(&bo->num_active_ioctls))
                      amdgpu_cs_sync_flush(rcs);
                }
             }
 
             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_READWRITE);
          }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index a3feeb93026..eb050b8fdb2 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -26,20 +26,24 @@
  * of the Software.
  */
 
 #include "amdgpu_cs.h"
 #include "util/os_time.h"
 #include <inttypes.h>
 #include <stdio.h>
 
 #include "amd/common/sid.h"
 
+#ifndef AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE
+#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
+#endif
+
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 
 /* FENCES */
 
 static struct pipe_fence_handle *
 amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
                     unsigned ip_instance, unsigned ring)
 {
    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
 
@@ -801,56 +805,68 @@ static void amdgpu_set_ib_size(struct amdgpu_ib *ib)
 }
 
 static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
 {
    amdgpu_set_ib_size(ib);
    ib->used_ib_space += ib->base.current.cdw * 4;
    ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_start_alignment);
   ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw);
 }
 
-static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
+static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
+                                   struct amdgpu_cs_context *cs,
                                    enum ring_type ring_type)
 {
    switch (ring_type) {
    case RING_DMA:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_DMA;
       break;
 
    case RING_UVD:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD;
       break;
 
    case RING_UVD_ENC:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD_ENC;
       break;
 
    case RING_VCE:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCE;
       break;
 
-   case RING_COMPUTE:
-      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_COMPUTE;
-      break;
-
    case RING_VCN_DEC:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_DEC;
       break;
 
-  case RING_VCN_ENC:
+   case RING_VCN_ENC:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_ENC;
       break;
 
-   default:
+   case RING_COMPUTE:
    case RING_GFX:
-      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_GFX;
+      cs->ib[IB_MAIN].ip_type = ring_type == RING_GFX ? AMDGPU_HW_IP_GFX :
+                                                        AMDGPU_HW_IP_COMPUTE;
+
+      /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache
+       * invalidation is the beginning of IBs (the previous commit does that),
+       * because completion of an IB doesn't care about the state of GPU caches,
+       * but the beginning of an IB does. Draw calls from multiple IBs can be
+       * executed in parallel, so draw calls from the current IB can finish after
+       * the next IB starts drawing, and so the cache flush at the end of IB
+       * is always late.
+       */
+      if (ws->info.drm_minor >= 26)
+         cs->ib[IB_MAIN].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
       break;
+
+   default:
+      assert(0);
    }
 
   memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
    cs->last_added_bo = NULL;
    return true;
 }
 
 static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
 {
    unsigned i;
@@ -918,26 +934,26 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
    cs->flush_data = flush_ctx;
    cs->ring_type = ring_type;
 
    struct amdgpu_cs_fence_info fence_info;
    fence_info.handle = cs->ctx->user_fence_bo;
    fence_info.offset = cs->ring_type;
    amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);
 
    cs->main.ib_type = IB_MAIN;
 
-   if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) {
+   if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) {
       FREE(cs);
       return NULL;
    }
 
-   if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) {
+   if (!amdgpu_init_cs_context(ctx->ws, &cs->csc2, ring_type)) {
       amdgpu_destroy_cs_context(&cs->csc1);
       FREE(cs);
       return NULL;
    }
 
    /* Set the first submission context as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
 
    if (!amdgpu_get_new_ib(&ctx->ws->base, cs, IB_MAIN)) {
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 1617a2fe32e..6652977e586 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -509,60 +509,64 @@ static void *radeon_bo_map(struct pb_buffer *buf,
         if (usage & PIPE_TRANSFER_DONTBLOCK) {
             if (!(usage & PIPE_TRANSFER_WRITE)) {
                 /* Mapping for read.
                  *
                  * Since we are mapping for read, we don't need to wait
                  * if the GPU is using the buffer for read too
                  * (neither one is changing it).
                  *
                  * Only check whether the buffer is being used for write. */
                 if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
-                    cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+                    cs->flush_cs(cs->flush_data,
+                                RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                     return NULL;
                 }
 
                 if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
                                     RADEON_USAGE_WRITE)) {
                     return NULL;
                 }
             } else {
                 if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) {
-                    cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+                    cs->flush_cs(cs->flush_data,
+                                RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                     return NULL;
                 }
 
                 if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
                                     RADEON_USAGE_READWRITE)) {
                     return NULL;
                 }
             }
         } else {
             uint64_t time = os_time_get_nano();
 
             if (!(usage & PIPE_TRANSFER_WRITE)) {
                 /* Mapping for read.
                  *
                  * Since we are mapping for read, we don't need to wait
                  * if the GPU is using the buffer for read too
                  * (neither one is changing it).
                  *
                  * Only check whether the buffer is being used for write. */
                 if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
-                    cs->flush_cs(cs->flush_data, 0, NULL);
+                    cs->flush_cs(cs->flush_data,
+                                RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
                 }
                 radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                                RADEON_USAGE_WRITE);
             } else {
                 /* Mapping for write. */
                 if (cs) {
                     if (radeon_bo_is_referenced_by_cs(cs, bo)) {
-                        cs->flush_cs(cs->flush_data, 0, NULL);
+                        cs->flush_cs(cs->flush_data,
+                                    RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
                     } else {
                         /* Try to avoid busy-waiting in radeon_bo_wait. */
                         if (p_atomic_read(&bo->num_active_ioctls))
                             radeon_drm_cs_sync_flush(rcs);
                     }
                 }
 
                 radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                                RADEON_USAGE_READWRITE);
             }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index a1975dff8df..9070464bec8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -400,21 +400,22 @@ static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
         unsigned i;
 
         for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
             p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
             radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
         }
         cs->csc->num_relocs = cs->csc->num_validated_relocs;
 
         /* Flush if there are any relocs. Clean up otherwise. */
         if (cs->csc->num_relocs) {
-            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+            cs->flush_cs(cs->flush_data,
+                        RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
         } else {
             radeon_cs_context_cleanup(cs->csc);
             cs->base.used_vram = 0;
             cs->base.used_gart = 0;
 
             assert(cs->base.current.cdw == 0);
             if (cs->base.current.cdw != 0) {
                 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
             }
         }
-- 
2.15.1
