From: Marek Olšák <marek.ol...@amd.com>

The SMEM and VMEM caches are L0, not L1, on gfx10, so the old L1-based flag
names no longer match the hardware. Rename the cache flush flags accordingly
and shorten the L2 flag names to match.
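
The flags renamed by this patch (old -> new):
  SI_CONTEXT_INV_SMEM_L1         -> SI_CONTEXT_INV_SCACHE
  SI_CONTEXT_INV_VMEM_L1         -> SI_CONTEXT_INV_VCACHE
  SI_CONTEXT_INV_GLOBAL_L2       -> SI_CONTEXT_INV_L2
  SI_CONTEXT_WRITEBACK_GLOBAL_L2 -> SI_CONTEXT_WB_L2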
---
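Reviewer note (not part of the commit message): a minimal sketch of how the
renamed flags are meant to combine, assuming the driver's usual pattern of
accumulating bits in sctx->flags and letting si_emit_cache_flush() emit them
(see si_upload_bindless_descriptors() below). The comment text here is
illustrative, not taken from the patch:

	/* Sketch: invalidate all shader-visible caches with the new names. */
	sctx->flags |= SI_CONTEXT_INV_ICACHE |
		       SI_CONTEXT_INV_SCACHE |
		       SI_CONTEXT_INV_VCACHE |
		       SI_CONTEXT_INV_L2;
	si_emit_cache_flush(sctx);
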
 src/gallium/drivers/radeonsi/si_compute.c     |  2 +-
 .../drivers/radeonsi/si_compute_blit.c        | 12 +++---
 src/gallium/drivers/radeonsi/si_descriptors.c |  2 +-
 src/gallium/drivers/radeonsi/si_gfx_cs.c      |  8 ++--
 src/gallium/drivers/radeonsi/si_pipe.c        |  8 ++--
 src/gallium/drivers/radeonsi/si_pipe.h        | 34 +++++++++--------
 src/gallium/drivers/radeonsi/si_state.c       | 14 +++----
 src/gallium/drivers/radeonsi/si_state_draw.c  | 38 +++++++++----------
 .../drivers/radeonsi/si_state_streamout.c     |  6 +--
 .../drivers/radeonsi/si_test_dma_perf.c       |  6 +--
 10 files changed, 66 insertions(+), 64 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 7e5259b70a0..63c95ed2604 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -910,21 +910,21 @@ static void si_launch_grid(
        /* Add buffer sizes for memory checking in need_cs_space. */
        si_context_add_resource_size(sctx, &program->shader.bo->b.b);
        /* TODO: add the scratch buffer */
 
        if (info->indirect) {
                si_context_add_resource_size(sctx, info->indirect);
 
                /* Indirect buffers use TC L2 on GFX9, but not older hw. */
                if (sctx->chip_class <= GFX8 &&
                    si_resource(info->indirect)->TC_L2_dirty) {
-                       sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                       sctx->flags |= SI_CONTEXT_WB_L2;
                        si_resource(info->indirect)->TC_L2_dirty = false;
                }
        }
 
        si_need_gfx_cs_space(sctx);
 
        if (sctx->bo_list_add_all_compute_resources)
                si_compute_resources_add_all_to_bo_list(sctx);
 
        if (!sctx->cs_shader_state.initialized) {
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 1cfdc9b62c6..4c5464ac118 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -44,23 +44,23 @@ static enum si_cache_policy get_cache_policy(struct si_context *sctx,
 
 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy)
 {
        switch (coher) {
        default:
        case SI_COHERENCY_NONE:
        case SI_COHERENCY_CP:
                return 0;
        case SI_COHERENCY_SHADER:
-               return SI_CONTEXT_INV_SMEM_L1 |
-                      SI_CONTEXT_INV_VMEM_L1 |
-                      (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+               return SI_CONTEXT_INV_SCACHE |
+                      SI_CONTEXT_INV_VCACHE |
+                      (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
        case SI_COHERENCY_CB_META:
                return SI_CONTEXT_FLUSH_AND_INV_CB;
        }
 }
 
 static void si_compute_internal_begin(struct si_context *sctx)
 {
        sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
        sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
        sctx->render_cond_force_off = true;
@@ -165,21 +165,21 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx,
                                                              SI_COMPUTE_CLEAR_DW_PER_THREAD,
                                                              shader_dst_stream_policy, false);
                }
                ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
        }
 
        ctx->launch_grid(ctx, &info);
 
        enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
        sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+                      (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
 
        if (cache_policy != L2_BYPASS)
                si_resource(dst)->TC_L2_dirty = true;
 
        /* Restore states. */
        ctx->bind_compute_state(ctx, saved_cs);
        ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
                                saved_writable_mask);
        si_compute_internal_end(sctx);
 }
@@ -411,21 +411,21 @@ void si_compute_copy_image(struct si_context *sctx,
                info.last_block[1] = height % 8;
                info.block[2] = 1;
                info.grid[0] = DIV_ROUND_UP(width, 8);
                info.grid[1] = DIV_ROUND_UP(height, 8);
                info.grid[2] = depth;
        }
 
        ctx->launch_grid(ctx, &info);
 
        sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+                      (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
        ctx->bind_compute_state(ctx, saved_cs);
        ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
        ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
        si_compute_internal_end(sctx);
 }
 
 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
 {
        struct pipe_context *ctx = &sctx->b;
@@ -590,17 +590,17 @@ void si_compute_clear_render_target(struct pipe_context *ctx,
                info.block[1] = 1;
                info.block[2] = 1;
                info.grid[0] = DIV_ROUND_UP(width, 64);
                info.grid[1] = num_layers;
                info.grid[2] = 1;
        }
 
        ctx->launch_grid(ctx, &info);
 
        sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+                      (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
                       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
        ctx->bind_compute_state(ctx, saved_cs);
        ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
        ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
        si_compute_internal_end(sctx);
 }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 37d92fa7363..2a13ffd32f9 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1874,21 +1874,21 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
                unsigned desc_slot = (*img_handle)->desc_slot;
 
                if (!(*img_handle)->desc_dirty)
                        continue;
 
                si_upload_bindless_descriptor(sctx, desc_slot, 8);
                (*img_handle)->desc_dirty = false;
        }
 
        /* Invalidate L1 because it doesn't know that L2 changed. */
-       sctx->flags |= SI_CONTEXT_INV_SMEM_L1;
+       sctx->flags |= SI_CONTEXT_INV_SCACHE;
        si_emit_cache_flush(sctx);
 
        sctx->bindless_descriptors_dirty = false;
 }
 
 /* Update mutable image descriptor fields of all resident textures. */
 static void si_update_bindless_texture_descriptor(struct si_context *sctx,
                                                  struct si_texture_handle *tex_handle)
 {
        struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index de0909904c8..9386df3a615 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -76,21 +76,21 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
        struct radeon_cmdbuf *cs = ctx->gfx_cs;
        struct radeon_winsys *ws = ctx->ws;
        unsigned wait_flags = 0;
 
        if (ctx->gfx_flush_in_progress)
                return;
 
        if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
                wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                              SI_CONTEXT_CS_PARTIAL_FLUSH |
-                             SI_CONTEXT_INV_GLOBAL_L2;
+                             SI_CONTEXT_INV_L2;
        } else if (ctx->chip_class == GFX6) {
                /* The kernel flushes L2 before shaders are finished. */
                wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                              SI_CONTEXT_CS_PARTIAL_FLUSH;
        } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
                wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                              SI_CONTEXT_CS_PARTIAL_FLUSH;
        }
 
        /* Drop this flush if it's a no-op. */
@@ -297,23 +297,23 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
         * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
         * buffers.
         *
         * Note that the cache flush done by the kernel at the end of GFX IBs
         * isn't useful here, because that flush can finish after the following
         * IB starts drawing.
         *
         * TODO: Do we also need to invalidate CB & DB caches?
         */
        ctx->flags |= SI_CONTEXT_INV_ICACHE |
-                     SI_CONTEXT_INV_SMEM_L1 |
-                     SI_CONTEXT_INV_VMEM_L1 |
-                     SI_CONTEXT_INV_GLOBAL_L2 |
+                     SI_CONTEXT_INV_SCACHE |
+                     SI_CONTEXT_INV_VCACHE |
+                     SI_CONTEXT_INV_L2 |
                      SI_CONTEXT_START_PIPELINE_STATS;
 
        ctx->cs_shader_state.initialized = false;
        si_all_descriptors_begin_new_cs(ctx);
 
        if (!ctx->has_graphics) {
                ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
                return;
        }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index d0d04bbb3de..31a9d92461f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1148,25 +1148,25 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
 
        sscreen->dcc_msaa_allowed =
                !(sscreen->debug_flags & DBG(NO_DCC_MSAA));
 
        sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= GFX8;
 
        (void) mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
        sscreen->use_monolithic_shaders =
                (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
 
-       sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
-                                           SI_CONTEXT_INV_VMEM_L1;
+       sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE |
+                                         SI_CONTEXT_INV_VCACHE;
        if (sscreen->info.chip_class <= GFX8) {
-               sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2;
-               sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+               sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
+               sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
        }
 
        if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
                sscreen->debug_flags |= DBG_ALL_SHADERS;
 
        /* Syntax:
         *     EQAA=s,z,c
         * Example:
         *     EQAA=8,4,2
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 368cb4e473d..11678e1b4cb 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -58,30 +58,32 @@
 #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
 #define SI_COMPUTE_COPY_DW_PER_THREAD  4
 #define SI_COMPUTE_DST_CACHE_POLICY    L2_STREAM
 
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS        (1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1)
 #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE          (1 << 3)
-/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
-#define SI_CONTEXT_INV_SMEM_L1         (1 << 4)
-/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
-#define SI_CONTEXT_INV_VMEM_L1         (1 << 5)
-/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
-#define SI_CONTEXT_INV_GLOBAL_L2       (1 << 6)
-/* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
- * invalidate L2. GFX6-GFX7 can't do it, so they will do complete 
invalidation. */
-#define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (1 << 7)
-/* Writeback & invalidate the L2 metadata cache. It can only be coupled with
+/* Scalar L1 cache. */
+#define SI_CONTEXT_INV_SCACHE          (1 << 4)
+/* Vector L1 cache. */
+#define SI_CONTEXT_INV_VCACHE          (1 << 5)
+/* L2 cache + L2 metadata cache writeback & invalidate.
+ * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */
+#define SI_CONTEXT_INV_L2              (1 << 6)
+/* L2 writeback (write dirty L2 lines to memory for non-L2 clients).
+ * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8.
+ * GFX6-7 will do complete invalidation, because the writeback is unsupported. */
+#define SI_CONTEXT_WB_L2               (1 << 7)
+/* Writeback & invalidate the L2 metadata cache only. It can only be coupled with
  * a CB or DB flush. */
 #define SI_CONTEXT_INV_L2_METADATA     (1 << 8)
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_DB    (1 << 9)
 #define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10)
 #define SI_CONTEXT_FLUSH_AND_INV_CB    (1 << 11)
 /* Engine synchronization. */
 #define SI_CONTEXT_VS_PARTIAL_FLUSH    (1 << 12)
 #define SI_CONTEXT_PS_PARTIAL_FLUSH    (1 << 13)
 #define SI_CONTEXT_CS_PARTIAL_FLUSH    (1 << 14)
@@ -1639,57 +1641,57 @@ si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
                si_destroy_saved_cs(*dst);
 
        *dst = src;
 }
 
 static inline void
 si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
                           bool shaders_read_metadata, bool dcc_pipe_aligned)
 {
        sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-                      SI_CONTEXT_INV_VMEM_L1;
+                      SI_CONTEXT_INV_VCACHE;
 
        if (sctx->chip_class >= GFX9) {
                /* Single-sample color is coherent with shaders on GFX9, but
                 * L2 metadata must be flushed if shaders read metadata.
                 * (DCC, CMASK).
                 */
                if (num_samples >= 2 ||
                    (shaders_read_metadata && !dcc_pipe_aligned))
-                       sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+                       sctx->flags |= SI_CONTEXT_INV_L2;
                else if (shaders_read_metadata)
                        sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
        } else {
                /* GFX6-GFX8 */
-               sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+               sctx->flags |= SI_CONTEXT_INV_L2;
        }
 }
 
 static inline void
 si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
                           bool include_stencil, bool shaders_read_metadata)
 {
        sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
-                      SI_CONTEXT_INV_VMEM_L1;
+                      SI_CONTEXT_INV_VCACHE;
 
        if (sctx->chip_class >= GFX9) {
                /* Single-sample depth (not stencil) is coherent with shaders
                 * on GFX9, but L2 metadata must be flushed if shaders read
                 * metadata.
                 */
                if (num_samples >= 2 || include_stencil)
-                       sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+                       sctx->flags |= SI_CONTEXT_INV_L2;
                else if (shaders_read_metadata)
                        sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
        } else {
                /* GFX6-GFX8 */
-               sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+               sctx->flags |= SI_CONTEXT_INV_L2;
        }
 }
 
 static inline bool
 si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
 {
        return (stencil_sampler && tex->can_sample_s) ||
               (!stencil_sampler && tex->can_sample_z);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index af21914a142..b9fc77f7918 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4785,61 +4785,61 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
        struct si_context *sctx = (struct si_context *)ctx;
 
        if (!(flags & ~PIPE_BARRIER_UPDATE))
                return;
 
        /* Subsequent commands must wait for all shader invocations to
         * complete. */
        sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-                        SI_CONTEXT_CS_PARTIAL_FLUSH;
+                      SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
-               sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
-                                SI_CONTEXT_INV_VMEM_L1;
+               sctx->flags |= SI_CONTEXT_INV_SCACHE |
+                              SI_CONTEXT_INV_VCACHE;
 
        if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
                     PIPE_BARRIER_SHADER_BUFFER |
                     PIPE_BARRIER_TEXTURE |
                     PIPE_BARRIER_IMAGE |
                     PIPE_BARRIER_STREAMOUT_BUFFER |
                     PIPE_BARRIER_GLOBAL_BUFFER)) {
                /* As far as I can tell, L1 contents are written back to L2
                 * automatically at end of shader, but the contents of other
                 * L1 caches might still be stale. */
-               sctx->flags |= SI_CONTEXT_INV_VMEM_L1;
+               sctx->flags |= SI_CONTEXT_INV_VCACHE;
        }
 
        if (flags & PIPE_BARRIER_INDEX_BUFFER) {
                /* Indices are read through TC L2 since GFX8.
                 * L1 isn't used.
                 */
                if (sctx->screen->info.chip_class <= GFX7)
-                       sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                       sctx->flags |= SI_CONTEXT_WB_L2;
        }
 
        /* MSAA color, any depth and any stencil are flushed in
         * si_decompress_textures when needed.
         */
        if (flags & PIPE_BARRIER_FRAMEBUFFER &&
            sctx->framebuffer.uncompressed_cb_mask) {
                sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
 
                if (sctx->chip_class <= GFX8)
-                       sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                       sctx->flags |= SI_CONTEXT_WB_L2;
        }
 
        /* Indirect buffers use TC L2 on GFX9, but not older hw. */
        if (sctx->screen->info.chip_class <= GFX8 &&
            flags & PIPE_BARRIER_INDIRECT_BUFFER)
-               sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+               sctx->flags |= SI_CONTEXT_WB_L2;
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
        struct pipe_blend_state blend;
 
        memset(&blend, 0, sizeof(blend));
        blend.independent_blend_enable = true;
        blend.rt[0].colormask = 0xf;
        return si_create_blend_state_mode(&sctx->b, &blend, mode);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index d780547659e..a81be533d64 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -952,24 +952,24 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
 }
 
 void si_emit_cache_flush(struct si_context *sctx)
 {
        struct radeon_cmdbuf *cs = sctx->gfx_cs;
        uint32_t flags = sctx->flags;
 
        if (!sctx->has_graphics) {
                /* Only process compute flags. */
                flags &= SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SMEM_L1 |
-                        SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
+                        SI_CONTEXT_INV_SCACHE |
+                        SI_CONTEXT_INV_VCACHE |
+                        SI_CONTEXT_INV_L2 |
+                        SI_CONTEXT_WB_L2 |
                         SI_CONTEXT_INV_L2_METADATA |
                         SI_CONTEXT_CS_PARTIAL_FLUSH;
        }
 
        uint32_t cp_coher_cntl = 0;
        const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
                                              SI_CONTEXT_FLUSH_AND_INV_DB);
        const bool is_barrier = flush_cb_db ||
                                /* INV_ICACHE == beginning of gfx IB. Checking
                                 * INV_ICACHE fixes corruption for DeusExMD with
@@ -989,21 +989,21 @@ void si_emit_cache_flush(struct si_context *sctx)
        /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
         * bit is set. An alternative way is to write SQC_CACHES, but that
         * doesn't seem to work reliably. Since the bug doesn't affect
         * correctness (it only does more work than necessary) and
         * the performance impact is likely negligible, there is no plan
         * to add a workaround for it.
         */
 
        if (flags & SI_CONTEXT_INV_ICACHE)
                cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-       if (flags & SI_CONTEXT_INV_SMEM_L1)
+       if (flags & SI_CONTEXT_INV_SCACHE)
                cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
        if (sctx->chip_class <= GFX8) {
                if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
                        cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
                                         S_0085F0_CB0_DEST_BASE_ENA(1) |
                                         S_0085F0_CB1_DEST_BASE_ENA(1) |
                                         S_0085F0_CB2_DEST_BASE_ENA(1) |
                                         S_0085F0_CB3_DEST_BASE_ENA(1) |
                                         S_0085F0_CB4_DEST_BASE_ENA(1) |
@@ -1107,29 +1107,29 @@ void si_emit_cache_flush(struct si_context *sctx)
                 * TCL1                  = invalidate L1
                 */
                tc_flags = 0;
 
                if (flags & SI_CONTEXT_INV_L2_METADATA) {
                        tc_flags = EVENT_TC_ACTION_ENA |
                                   EVENT_TC_MD_ACTION_ENA;
                }
 
                /* Ideally flush TC together with CB/DB. */
-               if (flags & SI_CONTEXT_INV_GLOBAL_L2) {
+               if (flags & SI_CONTEXT_INV_L2) {
                        /* Writeback and invalidate everything in L2 & L1. */
                        tc_flags = EVENT_TC_ACTION_ENA |
                                   EVENT_TC_WB_ACTION_ENA;
 
                        /* Clear the flags. */
-                       flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 |
-                                  SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
-                                  SI_CONTEXT_INV_VMEM_L1);
+                       flags &= ~(SI_CONTEXT_INV_L2 |
+                                  SI_CONTEXT_WB_L2 |
+                                  SI_CONTEXT_INV_VCACHE);
                        sctx->num_L2_invalidates++;
                }
 
                /* Do the flush (enqueue the event and wait for it). */
                va = sctx->wait_mem_scratch->gpu_address;
                sctx->wait_mem_number++;
 
                si_cp_release_mem(sctx, cs, cb_db_event, tc_flags,
                                  EOP_DST_SEL_MEM,
                                  EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
@@ -1139,66 +1139,66 @@ void si_emit_cache_flush(struct si_context *sctx)
                si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff,
                               WAIT_REG_MEM_EQUAL);
        }
 
        /* Make sure ME is idle (it executes most packets) before continuing.
         * This prevents read-after-write hazards between PFP and ME.
         */
        if (sctx->has_graphics &&
            (cp_coher_cntl ||
             (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
-                      SI_CONTEXT_INV_VMEM_L1 |
-                      SI_CONTEXT_INV_GLOBAL_L2 |
-                      SI_CONTEXT_WRITEBACK_GLOBAL_L2)))) {
+                      SI_CONTEXT_INV_VCACHE |
+                      SI_CONTEXT_INV_L2 |
+                      SI_CONTEXT_WB_L2)))) {
                radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
                radeon_emit(cs, 0);
        }
 
        /* GFX6-GFX8 only:
         *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
         *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
         *
         * cp_coher_cntl should contain all necessary flags except TC flags
         * at this point.
         *
         * GFX6-GFX7 don't support L2 write-back.
         */
-       if (flags & SI_CONTEXT_INV_GLOBAL_L2 ||
+       if (flags & SI_CONTEXT_INV_L2 ||
            (sctx->chip_class <= GFX7 &&
-            (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
+            (flags & SI_CONTEXT_WB_L2))) {
                /* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
                 * WB must be set on GFX8+ when TC_ACTION is set.
                 */
                si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
                                     S_0085F0_TC_ACTION_ENA(1) |
                                     S_0085F0_TCL1_ACTION_ENA(1) |
                                     S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
                cp_coher_cntl = 0;
                sctx->num_L2_invalidates++;
        } else {
                /* L1 invalidation and L2 writeback must be done separately,
                 * because both operations can't be done together.
                 */
-               if (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2) {
+               if (flags & SI_CONTEXT_WB_L2) {
                        /* WB = write-back
                         * NC = apply to non-coherent MTYPEs
                         *      (i.e. MTYPE <= 1, which is what we use everywhere)
                         *
                         * WB doesn't work without NC.
                         */
                        si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
                                             S_0301F0_TC_WB_ACTION_ENA(1) |
                                             S_0301F0_TC_NC_ACTION_ENA(1));
                        cp_coher_cntl = 0;
                        sctx->num_L2_writebacks++;
                }
-               if (flags & SI_CONTEXT_INV_VMEM_L1) {
+               if (flags & SI_CONTEXT_INV_VCACHE) {
                        /* Invalidate per-CU VMEM L1. */
                        si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
                                             S_0085F0_TCL1_ACTION_ENA(1));
                        cp_coher_cntl = 0;
                }
        }
 
        /* If TC flushes haven't cleared this... */
        if (cp_coher_cntl)
                si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl);
@@ -1581,46 +1581,46 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
                                      &index_offset, &indexbuf);
                        if (!indexbuf)
                                return;
 
                        /* info->start will be added by the drawing code */
                        index_offset -= start_offset;
                } else if (sctx->chip_class <= GFX7 &&
                           si_resource(indexbuf)->TC_L2_dirty) {
                        /* GFX8 reads index buffers through TC L2, so it doesn't
                         * need this. */
-                       sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                       sctx->flags |= SI_CONTEXT_WB_L2;
                        si_resource(indexbuf)->TC_L2_dirty = false;
                }
        }
 
        bool dispatch_prim_discard_cs = false;
        bool prim_discard_cs_instancing = false;
        unsigned original_index_size = index_size;
        unsigned direct_count = 0;
 
        if (info->indirect) {
                struct pipe_draw_indirect_info *indirect = info->indirect;
 
                /* Add the buffer size for memory checking in need_cs_space. */
                si_context_add_resource_size(sctx, indirect->buffer);
 
                /* Indirect buffers use TC L2 on GFX9, but not older hw. */
                if (sctx->chip_class <= GFX8) {
                        if (si_resource(indirect->buffer)->TC_L2_dirty) {
-                               sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                               sctx->flags |= SI_CONTEXT_WB_L2;
                                si_resource(indirect->buffer)->TC_L2_dirty = false;
                        }
 
                        if (indirect->indirect_draw_count &&
                            si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
-                               sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                               sctx->flags |= SI_CONTEXT_WB_L2;
                                si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
                        }
                }
        } else {
                /* Multiply by 3 for strips and fans to get an approximate vertex
                 * count as triangles. */
                direct_count = info->count * instance_count *
                               (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
        }
 
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index e7058f19a8a..e3c72ccdf49 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -114,23 +114,23 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
                /* Invalidate the scalar cache in case a streamout buffer is
                 * going to be used as a constant buffer.
                 *
                 * Invalidate vL1, because streamout bypasses it (done by
                 * setting GLC=1 in the store instruction), but vL1 in other
                 * CUs can contain outdated data of streamout buffers.
                 *
                 * VS_PARTIAL_FLUSH is required if the buffers are going to be
                 * used as an input immediately.
                 */
-               sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
-                                SI_CONTEXT_INV_VMEM_L1 |
-                                SI_CONTEXT_VS_PARTIAL_FLUSH;
+               sctx->flags |= SI_CONTEXT_INV_SCACHE |
+                              SI_CONTEXT_INV_VCACHE |
+                              SI_CONTEXT_VS_PARTIAL_FLUSH;
        }
 
        /* All readers of the streamout targets need to be finished before we can
         * start writing to the targets.
         */
        if (num_targets)
                sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                                 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* Streamout buffers must be bound in 2 places:
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index 0b5a4a38ab7..0a0b9c4a657 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -226,40 +226,40 @@ void si_test_dma_perf(struct si_screen *sscreen)
                                                sb[0].buffer_size = size;
 
                                                if (is_copy) {
                                                        sb[1].buffer = src;
                                                        sb[1].buffer_size = size;
                                                } else {
                                                        for (unsigned i = 0; i < 4; i++)
                                                                sctx->cs_user_data[i] = clear_value;
                                                }
 
-                                               sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
-                                                              SI_CONTEXT_INV_SMEM_L1;
+                                               sctx->flags |= SI_CONTEXT_INV_VCACHE |
+                                                              SI_CONTEXT_INV_SCACHE;
 
                                                ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0,
                                                                        is_copy ? 2 : 1, sb, 0x1);
                                                ctx->bind_compute_state(ctx, cs);
                                                sctx->cs_max_waves_per_sh = cs_waves_per_sh;
 
                                                ctx->launch_grid(ctx, &info);
 
                                                ctx->bind_compute_state(ctx, NULL);
                                                ctx->delete_compute_state(ctx, cs);
                                                sctx->cs_max_waves_per_sh = 0; /* disable the limit */
 
                                                sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
                                        }
 
                                        /* Flush L2, so that we don't just test L2 cache performance. */
                                        if (!test_sdma) {
-                                               sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+                                               sctx->flags |= SI_CONTEXT_WB_L2;
                                                si_emit_cache_flush(sctx);
                                        }
 
                                        ctx->end_query(ctx, q[iter]);
                                        ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
                                }
                                pipe_resource_reference(&dst, NULL);
                                pipe_resource_reference(&src, NULL);
 
                                /* Get results. */
-- 
2.17.1
