From: Marek Olšák <marek.ol...@amd.com> SMEM and VMEM caches are L0 on gfx10. --- src/gallium/drivers/radeonsi/si_compute.c | 2 +- .../drivers/radeonsi/si_compute_blit.c | 12 +++--- src/gallium/drivers/radeonsi/si_descriptors.c | 2 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 8 ++-- src/gallium/drivers/radeonsi/si_pipe.c | 8 ++-- src/gallium/drivers/radeonsi/si_pipe.h | 34 +++++++++-------- src/gallium/drivers/radeonsi/si_state.c | 14 +++---- src/gallium/drivers/radeonsi/si_state_draw.c | 38 +++++++++---------- .../drivers/radeonsi/si_state_streamout.c | 6 +-- .../drivers/radeonsi/si_test_dma_perf.c | 6 +-- 10 files changed, 66 insertions(+), 64 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 7e5259b70a0..63c95ed2604 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -910,21 +910,21 @@ static void si_launch_grid( /* Add buffer sizes for memory checking in need_cs_space. */ si_context_add_resource_size(sctx, &program->shader.bo->b.b); /* TODO: add the scratch buffer */ if (info->indirect) { si_context_add_resource_size(sctx, info->indirect); /* Indirect buffers use TC L2 on GFX9, but not older hw. */ if (sctx->chip_class <= GFX8 && si_resource(info->indirect)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; si_resource(info->indirect)->TC_L2_dirty = false; } } si_need_gfx_cs_space(sctx); if (sctx->bo_list_add_all_compute_resources) si_compute_resources_add_all_to_bo_list(sctx); if (!sctx->cs_shader_state.initialized) { diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 1cfdc9b62c6..4c5464ac118 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -44,23 +44,23 @@ static enum si_cache_policy get_cache_policy(struct si_context *sctx, unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, enum si_cache_policy cache_policy) { switch (coher) { default: case SI_COHERENCY_NONE: case SI_COHERENCY_CP: return 0; case SI_COHERENCY_SHADER: - return SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1 | - (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0); + return SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE | + (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0); case SI_COHERENCY_CB_META: return SI_CONTEXT_FLUSH_AND_INV_CB; } } static void si_compute_internal_begin(struct si_context *sctx) { sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; sctx->render_cond_force_off = true; @@ -165,21 +165,21 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false); } ctx->bind_compute_state(ctx, sctx->cs_clear_buffer); } ctx->launch_grid(ctx, &info); enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0); + (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0); if (cache_policy != L2_BYPASS) si_resource(dst)->TC_L2_dirty = true; /* Restore states. */ ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask); si_compute_internal_end(sctx); } @@ -411,21 +411,21 @@ void si_compute_copy_image(struct si_context *sctx, info.last_block[1] = height % 8; info.block[2] = 1; info.grid[0] = DIV_ROUND_UP(width, 8); info.grid[1] = DIV_ROUND_UP(height, 8); info.grid[2] = depth; } ctx->launch_grid(ctx, &info); sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) | + (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); si_compute_internal_end(sctx); } void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) { struct pipe_context *ctx = &sctx->b; @@ -590,17 +590,17 @@ void si_compute_clear_render_target(struct pipe_context *ctx, info.block[1] = 1; info.block[2] = 1; info.grid[0] = DIV_ROUND_UP(width, 64); info.grid[1] = num_layers; info.grid[2] = 1; } ctx->launch_grid(ctx, &info); sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) | + (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); si_compute_internal_end(sctx); } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 37d92fa7363..2a13ffd32f9 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1874,21 +1874,21 @@ static void si_upload_bindless_descriptors(struct si_context *sctx) unsigned desc_slot = (*img_handle)->desc_slot; if (!(*img_handle)->desc_dirty) continue; si_upload_bindless_descriptor(sctx, desc_slot, 8); (*img_handle)->desc_dirty = false; } /* Invalidate L1 because it doesn't know that L2 changed. */ - sctx->flags |= SI_CONTEXT_INV_SMEM_L1; + sctx->flags |= SI_CONTEXT_INV_SCACHE; si_emit_cache_flush(sctx); sctx->bindless_descriptors_dirty = false; } /* Update mutable image descriptor fields of all resident textures. */ static void si_update_bindless_texture_descriptor(struct si_context *sctx, struct si_texture_handle *tex_handle) { struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view; diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index de0909904c8..9386df3a615 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -76,21 +76,21 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct radeon_cmdbuf *cs = ctx->gfx_cs; struct radeon_winsys *ws = ctx->ws; unsigned wait_flags = 0; if (ctx->gfx_flush_in_progress) return; if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) { wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_GLOBAL_L2; + SI_CONTEXT_INV_L2; } else if (ctx->chip_class == GFX6) { /* The kernel flushes L2 before shaders are finished. */ wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; } /* Drop this flush if it's a no-op. */ @@ -297,23 +297,23 @@ void si_begin_new_gfx_cs(struct si_context *ctx) * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our * buffers. * * Note that the cache flush done by the kernel at the end of GFX IBs * isn't useful here, because that flush can finish after the following * IB starts drawing. * * TODO: Do we also need to invalidate CB & DB caches? */ ctx->flags |= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS; ctx->cs_shader_state.initialized = false; si_all_descriptors_begin_new_cs(ctx); if (!ctx->has_graphics) { ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw; return; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index d0d04bbb3de..31a9d92461f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1148,25 +1148,25 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->dcc_msaa_allowed = !(sscreen->debug_flags & DBG(NO_DCC_MSAA)); sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= GFX8; (void) mtx_init(&sscreen->shader_parts_mutex, mtx_plain); sscreen->use_monolithic_shaders = (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; - sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1; + sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE; if (sscreen->info.chip_class <= GFX8) { - sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2; - sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2; + sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2; } if (debug_get_bool_option("RADEON_DUMP_SHADERS", false)) sscreen->debug_flags |= DBG_ALL_SHADERS; /* Syntax: * EQAA=s,z,c * Example: * EQAA=8,4,2 diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 368cb4e473d..11678e1b4cb 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -58,30 +58,32 @@ #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 #define SI_COMPUTE_COPY_DW_PER_THREAD 4 #define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM /* Pipeline & streamout query controls. */ #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) #define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1) #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2) /* Instruction cache. */ #define SI_CONTEXT_INV_ICACHE (1 << 3) -/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */ -#define SI_CONTEXT_INV_SMEM_L1 (1 << 4) -/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */ -#define SI_CONTEXT_INV_VMEM_L1 (1 << 5) -/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */ -#define SI_CONTEXT_INV_GLOBAL_L2 (1 << 6) -/* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't - * invalidate L2. GFX6-GFX7 can't do it, so they will do complete invalidation. */ -#define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (1 << 7) -/* Writeback & invalidate the L2 metadata cache. It can only be coupled with +/* Scalar L1 cache. */ +#define SI_CONTEXT_INV_SCACHE (1 << 4) +/* Vector L1 cache. */ +#define SI_CONTEXT_INV_VCACHE (1 << 5) +/* L2 cache + L2 metadata cache writeback & invalidate. + * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */ +#define SI_CONTEXT_INV_L2 (1 << 6) +/* L2 writeback (write dirty L2 lines to memory for non-L2 clients). + * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8. + * GFX6-7 will do complete invalidation, because the writeback is unsupported. */ +#define SI_CONTEXT_WB_L2 (1 << 7) +/* Writeback & invalidate the L2 metadata cache only. It can only be coupled with * a CB or DB flush. */ #define SI_CONTEXT_INV_L2_METADATA (1 << 8) /* Framebuffer caches. */ #define SI_CONTEXT_FLUSH_AND_INV_DB (1 << 9) #define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10) #define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 11) /* Engine synchronization. */ #define SI_CONTEXT_VS_PARTIAL_FLUSH (1 << 12) #define SI_CONTEXT_PS_PARTIAL_FLUSH (1 << 13) #define SI_CONTEXT_CS_PARTIAL_FLUSH (1 << 14) @@ -1639,57 +1641,57 @@ si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src) si_destroy_saved_cs(*dst); *dst = src; } static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, bool shaders_read_metadata, bool dcc_pipe_aligned) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_INV_VMEM_L1; + SI_CONTEXT_INV_VCACHE; if (sctx->chip_class >= GFX9) { /* Single-sample color is coherent with shaders on GFX9, but * L2 metadata must be flushed if shaders read metadata. * (DCC, CMASK). */ if (num_samples >= 2 || (shaders_read_metadata && !dcc_pipe_aligned)) - sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_INV_L2; else if (shaders_read_metadata) sctx->flags |= SI_CONTEXT_INV_L2_METADATA; } else { /* GFX6-GFX8 */ - sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_INV_L2; } } static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, bool include_stencil, bool shaders_read_metadata) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_INV_VMEM_L1; + SI_CONTEXT_INV_VCACHE; if (sctx->chip_class >= GFX9) { /* Single-sample depth (not stencil) is coherent with shaders * on GFX9, but L2 metadata must be flushed if shaders read * metadata. */ if (num_samples >= 2 || include_stencil) - sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_INV_L2; else if (shaders_read_metadata) sctx->flags |= SI_CONTEXT_INV_L2_METADATA; } else { /* GFX6-GFX8 */ - sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_INV_L2; } } static inline bool si_can_sample_zs(struct si_texture *tex, bool stencil_sampler) { return (stencil_sampler && tex->can_sample_s) || (!stencil_sampler && tex->can_sample_z); } diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index af21914a142..b9fc77f7918 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -4785,61 +4785,61 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; if (!(flags & ~PIPE_BARRIER_UPDATE)) return; /* Subsequent commands must wait for all shader invocations to * complete. */ sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH; + SI_CONTEXT_CS_PARTIAL_FLUSH; if (flags & PIPE_BARRIER_CONSTANT_BUFFER) - sctx->flags |= SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1; + sctx->flags |= SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE; if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE | PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) { /* As far as I can tell, L1 contents are written back to L2 * automatically at end of shader, but the contents of other * L1 caches might still be stale. */ - sctx->flags |= SI_CONTEXT_INV_VMEM_L1; + sctx->flags |= SI_CONTEXT_INV_VCACHE; } if (flags & PIPE_BARRIER_INDEX_BUFFER) { /* Indices are read through TC L2 since GFX8. * L1 isn't used. */ if (sctx->screen->info.chip_class <= GFX7) - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; } /* MSAA color, any depth and any stencil are flushed in * si_decompress_textures when needed. */ if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB; if (sctx->chip_class <= GFX8) - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; } /* Indirect buffers use TC L2 on GFX9, but not older hw. */ if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER) - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { struct pipe_blend_state blend; memset(&blend, 0, sizeof(blend)); blend.independent_blend_enable = true; blend.rt[0].colormask = 0xf; return si_create_blend_state_mode(&sctx->b, &blend, mode); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index d780547659e..a81be533d64 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -952,24 +952,24 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) } void si_emit_cache_flush(struct si_context *sctx) { struct radeon_cmdbuf *cs = sctx->gfx_cs; uint32_t flags = sctx->flags; if (!sctx->has_graphics) { /* Only process compute flags. */ flags &= SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_WRITEBACK_GLOBAL_L2 | + SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | + SI_CONTEXT_WB_L2 | SI_CONTEXT_INV_L2_METADATA | SI_CONTEXT_CS_PARTIAL_FLUSH; } uint32_t cp_coher_cntl = 0; const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); const bool is_barrier = flush_cb_db || /* INV_ICACHE == beginning of gfx IB. Checking * INV_ICACHE fixes corruption for DeusExMD with @@ -989,21 +989,21 @@ void si_emit_cache_flush(struct si_context *sctx) /* GFX6 has a bug that it always flushes ICACHE and KCACHE if either * bit is set. An alternative way is to write SQC_CACHES, but that * doesn't seem to work reliably. Since the bug doesn't affect * correctness (it only does more work than necessary) and * the performance impact is likely negligible, there is no plan * to add a workaround for it. */ if (flags & SI_CONTEXT_INV_ICACHE) cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); - if (flags & SI_CONTEXT_INV_SMEM_L1) + if (flags & SI_CONTEXT_INV_SCACHE) cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); if (sctx->chip_class <= GFX8) { if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) { cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) | S_0085F0_CB0_DEST_BASE_ENA(1) | S_0085F0_CB1_DEST_BASE_ENA(1) | S_0085F0_CB2_DEST_BASE_ENA(1) | S_0085F0_CB3_DEST_BASE_ENA(1) | S_0085F0_CB4_DEST_BASE_ENA(1) | @@ -1107,29 +1107,29 @@ void si_emit_cache_flush(struct si_context *sctx) * TCL1 = invalidate L1 */ tc_flags = 0; if (flags & SI_CONTEXT_INV_L2_METADATA) { tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_MD_ACTION_ENA; } /* Ideally flush TC together with CB/DB. */ - if (flags & SI_CONTEXT_INV_GLOBAL_L2) { + if (flags & SI_CONTEXT_INV_L2) { /* Writeback and invalidate everything in L2 & L1. */ tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; /* Clear the flags. */ - flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_WRITEBACK_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1); + flags &= ~(SI_CONTEXT_INV_L2 | + SI_CONTEXT_WB_L2 | + SI_CONTEXT_INV_VCACHE); sctx->num_L2_invalidates++; } /* Do the flush (enqueue the event and wait for it). */ va = sctx->wait_mem_scratch->gpu_address; sctx->wait_mem_number++; si_cp_release_mem(sctx, cs, cb_db_event, tc_flags, EOP_DST_SEL_MEM, EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, @@ -1139,66 +1139,66 @@ void si_emit_cache_flush(struct si_context *sctx) si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff, WAIT_REG_MEM_EQUAL); } /* Make sure ME is idle (it executes most packets) before continuing. * This prevents read-after-write hazards between PFP and ME. */ if (sctx->has_graphics && (cp_coher_cntl || (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_WRITEBACK_GLOBAL_L2)))) { + SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_L2 | + SI_CONTEXT_WB_L2)))) { radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cs, 0); } /* GFX6-GFX8 only: * When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC * waits for idle, so it should be last. SURFACE_SYNC is done in PFP. * * cp_coher_cntl should contain all necessary flags except TC flags * at this point. * * GFX6-GFX7 don't support L2 write-back. */ - if (flags & SI_CONTEXT_INV_GLOBAL_L2 || + if (flags & SI_CONTEXT_INV_L2 || (sctx->chip_class <= GFX7 && - (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) { + (flags & SI_CONTEXT_WB_L2))) { /* Invalidate L1 & L2. (L1 is always invalidated on GFX6) * WB must be set on GFX8+ when TC_ACTION is set. */ si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8)); cp_coher_cntl = 0; sctx->num_L2_invalidates++; } else { /* L1 invalidation and L2 writeback must be done separately, * because both operations can't be done together. */ - if (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2) { + if (flags & SI_CONTEXT_WB_L2) { /* WB = write-back * NC = apply to non-coherent MTYPEs * (i.e. MTYPE <= 1, which is what we use everywhere) * * WB doesn't work without NC. */ si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | S_0301F0_TC_WB_ACTION_ENA(1) | S_0301F0_TC_NC_ACTION_ENA(1)); cp_coher_cntl = 0; sctx->num_L2_writebacks++; } - if (flags & SI_CONTEXT_INV_VMEM_L1) { + if (flags & SI_CONTEXT_INV_VCACHE) { /* Invalidate per-CU VMEM L1. */ si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl | S_0085F0_TCL1_ACTION_ENA(1)); cp_coher_cntl = 0; } } /* If TC flushes haven't cleared this... */ if (cp_coher_cntl) si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl); @@ -1581,46 +1581,46 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i &index_offset, &indexbuf); if (!indexbuf) return; /* info->start will be added by the drawing code */ index_offset -= start_offset; } else if (sctx->chip_class <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) { /* GFX8 reads index buffers through TC L2, so it doesn't * need this. */ - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; si_resource(indexbuf)->TC_L2_dirty = false; } } bool dispatch_prim_discard_cs = false; bool prim_discard_cs_instancing = false; unsigned original_index_size = index_size; unsigned direct_count = 0; if (info->indirect) { struct pipe_draw_indirect_info *indirect = info->indirect; /* Add the buffer size for memory checking in need_cs_space. */ si_context_add_resource_size(sctx, indirect->buffer); /* Indirect buffers use TC L2 on GFX9, but not older hw. */ if (sctx->chip_class <= GFX8) { if (si_resource(indirect->buffer)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; si_resource(indirect->buffer)->TC_L2_dirty = false; } if (indirect->indirect_draw_count && si_resource(indirect->indirect_draw_count)->TC_L2_dirty) { - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; } } } else { /* Multiply by 3 for strips and fans to get an approximate vertex * count as triangles. */ direct_count = info->count * instance_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3); } diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index e7058f19a8a..e3c72ccdf49 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -114,23 +114,23 @@ static void si_set_streamout_targets(struct pipe_context *ctx, /* Invalidate the scalar cache in case a streamout buffer is * going to be used as a constant buffer. * * Invalidate vL1, because streamout bypasses it (done by * setting GLC=1 in the store instruction), but vL1 in other * CUs can contain outdated data of streamout buffers. * * VS_PARTIAL_FLUSH is required if the buffers are going to be * used as an input immediately. */ - sctx->flags |= SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_VS_PARTIAL_FLUSH; + sctx->flags |= SI_CONTEXT_INV_SCACHE | + SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_VS_PARTIAL_FLUSH; } /* All readers of the streamout targets need to be finished before we can * start writing to the targets. */ if (num_targets) sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; /* Streamout buffers must be bound in 2 places: diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index 0b5a4a38ab7..0a0b9c4a657 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -226,40 +226,40 @@ void si_test_dma_perf(struct si_screen *sscreen) sb[0].buffer_size = size; if (is_copy) { sb[1].buffer = src; sb[1].buffer_size = size; } else { for (unsigned i = 0; i < 4; i++) sctx->cs_user_data[i] = clear_value; } - sctx->flags |= SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_SMEM_L1; + sctx->flags |= SI_CONTEXT_INV_VCACHE | + SI_CONTEXT_INV_SCACHE; ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1); ctx->bind_compute_state(ctx, cs); sctx->cs_max_waves_per_sh = cs_waves_per_sh; ctx->launch_grid(ctx, &info); ctx->bind_compute_state(ctx, NULL); ctx->delete_compute_state(ctx, cs); sctx->cs_max_waves_per_sh = 0; /* disable the limit */ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; } /* Flush L2, so that we don't just test L2 cache performance. */ if (!test_sdma) { - sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->flags |= SI_CONTEXT_WB_L2; si_emit_cache_flush(sctx); } ctx->end_query(ctx, q[iter]); ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); } pipe_resource_reference(&dst, NULL); pipe_resource_reference(&src, NULL); /* Get results. */ -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev