From: Marek Olšák <marek.ol...@amd.com> --- src/gallium/drivers/radeonsi/si_blit.c | 8 +++++--- src/gallium/drivers/radeonsi/si_pipe.h | 23 +++++++++++++++++++---- src/gallium/drivers/radeonsi/si_state.c | 19 +++++++++++++++---- src/gallium/drivers/radeonsi/si_state_draw.c | 11 +++++------ 4 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index ae7f809..3228933 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -399,21 +399,22 @@ si_decompress_depth(struct si_context *sctx, if (inplace_planes & PIPE_MASK_Z) tex->dirty_level_mask = 0; if (inplace_planes & PIPE_MASK_S) tex->stencil_dirty_level_mask = 0; } } /* set_framebuffer_state takes care of coherency for single-sample. * The DB->CB copy uses CB for the final writes. */ if (copy_planes && tex->resource.b.b.nr_samples > 1) - si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples); + si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples, + false); } static void si_decompress_sampler_depth_textures(struct si_context *sctx, struct si_textures_info *textures) { unsigned i; unsigned mask = textures->needs_depth_decompress_mask; while (mask) { @@ -504,21 +505,22 @@ static void si_blit_decompress_color(struct pipe_context *ctx, } /* The texture will always be dirty if some layers aren't flushed. * I don't think this case occurs often though. */ if (first_layer == 0 && last_layer >= max_layer) { rtex->dirty_level_mask &= ~(1 << level); } } sctx->decompression_enabled = false; - si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples); + si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples, + vi_dcc_enabled(rtex, first_level)); } static void si_decompress_color_texture(struct si_context *sctx, struct r600_texture *tex, unsigned first_level, unsigned last_level) { /* CMASK or DCC can be discarded and we can still end up here. */ if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset) return; @@ -1193,21 +1195,21 @@ static void si_do_CB_resolve(struct si_context *sctx, si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, info->src.resource, info->src.box.z, ~0, sctx->custom_blend_resolve, format); si_blitter_end(&sctx->b.b); /* Flush caches for possible texturing. */ - si_make_CB_shader_coherent(sctx, 1); + si_make_CB_shader_coherent(sctx, 1, false); } static bool do_hardware_msaa_resolve(struct pipe_context *ctx, const struct pipe_blit_info *info) { struct si_context *sctx = (struct si_context*)ctx; struct r600_texture *src = (struct r600_texture*)info->src.resource; struct r600_texture *dst = (struct r600_texture*)info->dst.resource; MAYBE_UNUSED struct r600_texture *rtmp; unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 671c488..3e59e21 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -50,21 +50,24 @@ #define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0) /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */ #define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 1) /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */ #define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 2) /* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */ #define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3) /* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't * invalidate L2. SI-CIK can't do it, so they will do complete invalidation. */ #define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 4) -/* gaps */ +/* Writeback & invalidate the L2 metadata cache. It can only be coupled with + * a CB or DB flush. */ +#define SI_CONTEXT_INV_L2_METADATA (R600_CONTEXT_PRIVATE_FLAG << 5) +/* gap */ /* Framebuffer caches. */ #define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 7) #define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 8) /* Engine synchronization. */ #define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9) #define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10) #define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11) #define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 12) #define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13) @@ -190,20 +193,21 @@ struct si_framebuffer { unsigned spi_shader_col_format_blend; unsigned spi_shader_col_format_blend_alpha; ubyte nr_samples:5; /* at most 16xAA */ ubyte log_samples:3; /* at most 4 = 16xAA */ ubyte compressed_cb_mask; ubyte color_is_int8; ubyte color_is_int10; ubyte dirty_cbufs; bool dirty_zsbuf; bool any_dst_linear; + bool CB_has_shader_readable_metadata; }; struct si_clip_state { struct r600_atom atom; struct pipe_clip_state state; bool any_nonzeros; }; struct si_sample_locs { struct r600_atom atom; @@ -588,28 +592,39 @@ si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size) * the whole thing will fit into a cache line if we align it to its size. * The idea is that multiple small uploads can share a cache line. * If the upload size is greater, align it to the cache line size. */ alignment = util_next_power_of_two(upload_size); tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size; return MIN2(alignment, tcc_cache_line_size); } static inline void -si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples) +si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, + bool shaders_read_metadata) { sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VMEM_L1; - /* Single-sample color is coherent with shaders on GFX9. */ - if (sctx->b.chip_class <= VI || num_samples >= 2) + if (sctx->b.chip_class >= GFX9) { + /* Single-sample color is coherent with shaders on GFX9, but + * L2 metadata must be flushed if shaders read metadata. + * (DCC, CMASK). + */ + if (num_samples >= 2) + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; + else if (shaders_read_metadata) + sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA; + } else { + /* SI-CI-VI */ sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; + } } static inline void si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, bool include_stencil) { sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_INV_VMEM_L1; /* Single-sample depth (not stencil) is coherent with shaders on GFX9. */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index d116c07..e5d8d21 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2566,21 +2566,22 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * * When MSAA is enabled, CB and TC caches are flushed on demand * (after FMASK decompression). Shader write -> FB read transitions * cannot happen for MSAA textures, because MSAA shader images are * not supported. * * Only flush and wait for CB if there is actually a bound color buffer. */ if (sctx->framebuffer.nr_samples <= 1 && sctx->framebuffer.state.nr_cbufs) - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples); + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata); sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; /* u_blitter doesn't invoke depth decompression when it does multiple * blits in a row, but the only case when it matters for DB is when * doing generate_mipmap. So here we flush DB manually between * individual generate_mipmap blits. * Note that lower mipmap levels aren't compressed. */ if (sctx->generate_mipmap_for_depth) @@ -2601,20 +2602,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, sctx->framebuffer.spi_shader_col_format_alpha = 0; sctx->framebuffer.spi_shader_col_format_blend = 0; sctx->framebuffer.spi_shader_col_format_blend_alpha = 0; sctx->framebuffer.color_is_int8 = 0; sctx->framebuffer.color_is_int10 = 0; sctx->framebuffer.compressed_cb_mask = 0; sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state); sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples); sctx->framebuffer.any_dst_linear = false; + sctx->framebuffer.CB_has_shader_readable_metadata = false; for (i = 0; i < state->nr_cbufs; i++) { if (!state->cbufs[i]) continue; surf = (struct r600_surface*)state->cbufs[i]; rtex = (struct r600_texture*)surf->base.texture; if (!surf->color_initialized) { si_initialize_color_surface(sctx, surf); @@ -2635,20 +2637,23 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (surf->color_is_int10) sctx->framebuffer.color_is_int10 |= 1 << i; if (rtex->fmask.size) { sctx->framebuffer.compressed_cb_mask |= 1 << i; } if (rtex->surface.is_linear) sctx->framebuffer.any_dst_linear = true; + if (vi_dcc_enabled(rtex, surf->base.u.tex.level)) + sctx->framebuffer.CB_has_shader_readable_metadata = true; + r600_context_add_resource_size(ctx, surf->base.texture); p_atomic_inc(&rtex->framebuffers_bound); if (rtex->dcc_gather_statistics) { /* Dirty tracking must be enabled for DCC usage analysis. */ sctx->framebuffer.compressed_cb_mask |= 1 << i; vi_separate_dcc_start_query(ctx, rtex); } } @@ -4015,21 +4020,22 @@ static void si_set_tess_state(struct pipe_context *ctx, static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; si_update_fb_dirtiness_after_rendering(sctx); /* Multisample surfaces are flushed in si_decompress_textures. */ if (sctx->framebuffer.nr_samples <= 1 && sctx->framebuffer.state.nr_cbufs) - si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples); + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples, + sctx->framebuffer.CB_has_shader_readable_metadata); } /* This only ensures coherency for shader image/buffer stores. */ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; /* Subsequent commands must wait for all shader invocations to * complete. */ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | @@ -4060,23 +4066,28 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) } /* MSAA color, any depth and any stencil are flushed in * si_decompress_textures when needed. */ if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.nr_samples <= 1 && sctx->framebuffer.state.nr_cbufs) { sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB; - /* Single-sample color is coherent with TC on GFX9. */ - if (sctx->screen->b.chip_class <= VI) + if (sctx->b.chip_class >= GFX9) { + /* Single-sample color is coherent with TC on GFX9. */ + if (sctx->framebuffer.CB_has_shader_readable_metadata) + sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA; + } else { + /* SI-CI-VI */ sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + } } /* Indirect buffers use TC L2 on GFX9, but not older hw. */ if (sctx->screen->b.chip_class <= VI && flags & PIPE_BARRIER_INDIRECT_BUFFER) sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 2796427..b981676 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -980,27 +980,26 @@ void si_emit_cache_flush(struct si_context *sctx) * All operations that invalidate L2 also seem to invalidate * metadata. Volatile (VOL) and WC flushes are not listed here. * * TC | TC_WB = writeback & invalidate L2 & L1 * TC | TC_WB | TC_NC = writeback & invalidate L2 for MTYPE == NC * TC_WB | TC_NC = writeback L2 for MTYPE == NC * TC | TC_NC = invalidate L2 for MTYPE == NC * TC | TC_MD = writeback & invalidate L2 metadata (DCC, etc.) * TCL1 = invalidate L1 */ + tc_flags = 0; - /* When flushing CB or DB, L2 metadata should always be invali- - * dated before texturing. Invalidating L2 data is not needed - * in some cases. - */ - tc_flags = EVENT_TC_ACTION_ENA | - EVENT_TC_MD_ACTION_ENA; + if (rctx->flags & SI_CONTEXT_INV_L2_METADATA) { + tc_flags = EVENT_TC_ACTION_ENA | + EVENT_TC_MD_ACTION_ENA; + } /* Ideally flush TC together with CB/DB. */ if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) { /* Writeback and invalidate everything in L2 & L1. */ tc_flags = EVENT_TC_ACTION_ENA | EVENT_TC_WB_ACTION_ENA; /* Clear the flags. */ rctx->flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_WRITEBACK_GLOBAL_L2 | -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev