From: Marek Olšák <marek.ol...@amd.com>

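Replace the prefetch_L2 state atom with a bool flag in si_context and call
cik_emit_prefetch_L2 directly from si_draw_vbo.
I'd like to be able to move the prefetch call site around.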
---
 src/gallium/drivers/radeonsi/si_cp_dma.c        | 7 +++----
 src/gallium/drivers/radeonsi/si_descriptors.c   | 2 +-
 src/gallium/drivers/radeonsi/si_hw_context.c    | 2 +-
 src/gallium/drivers/radeonsi/si_pipe.h          | 3 ++-
 src/gallium/drivers/radeonsi/si_state.h         | 1 -
 src/gallium/drivers/radeonsi/si_state_draw.c    | 3 +++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 2 +-
 7 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index e42f260..9f0e506 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -443,21 +443,21 @@ static void cik_prefetch_shader_async(struct si_context *sctx,
                                      struct si_pm4_state *state)
 {
        if (state) {
                struct pipe_resource *bo = &state->bo[0]->b.b;
                assert(state->nbo == 1);
 
                cik_prefetch_TC_L2_async(sctx, bo, 0, bo->width0);
        }
 }
 
-static void cik_emit_prefetch_L2(struct si_context *sctx, struct r600_atom *atom)
+void cik_emit_prefetch_L2(struct si_context *sctx)
 {
        /* Prefetch shaders and VBO descriptors to TC L2. */
        if (si_pm4_state_changed(sctx, ls))
                cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
        if (si_pm4_state_changed(sctx, hs))
                cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
        if (si_pm4_state_changed(sctx, es))
                cik_prefetch_shader_async(sctx, sctx->queued.named.es);
        if (si_pm4_state_changed(sctx, gs))
                cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
@@ -466,19 +466,18 @@ static void cik_emit_prefetch_L2(struct si_context *sctx, struct r600_atom *atom
 
        /* Vertex buffer descriptors are uploaded uncached, so prefetch
         * them right after the VS binary. */
        if (sctx->vertex_buffer_pointer_dirty) {
                cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
                                         sctx->vertex_buffers.buffer_offset,
                                         sctx->vertex_elements->desc_list_byte_size);
        }
        if (si_pm4_state_changed(sctx, ps))
                cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
+
+       sctx->prefetch_L2 = false;
 }
 
 void si_init_cp_dma_functions(struct si_context *sctx)
 {
        sctx->b.clear_buffer = si_clear_buffer;
-
-       si_init_atom(sctx, &sctx->prefetch_L2, &sctx->atoms.s.prefetch_L2,
-                    cik_emit_prefetch_L2);
 }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index ea5b89e..917b0e1 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1170,21 +1170,21 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
                                              RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
                }
        }
 
        /* Don't flush the const cache. It would have a very negative effect
         * on performance (confirmed by testing). New descriptors are always
         * uploaded to a fresh new buffer, so I don't think flushing the const
         * cache is needed. */
        si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
        if (sctx->b.chip_class >= CIK)
-               si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
+               sctx->prefetch_L2 = true;
        sctx->vertex_buffers_dirty = false;
        sctx->vertex_buffer_pointer_dirty = true;
        return true;
 }
 
 
 /* CONSTANT BUFFERS */
 
 static unsigned
 si_const_and_shader_buffer_descriptors_idx(unsigned shader)
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index f2dfcc7..756b159 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -210,21 +210,21 @@ void si_begin_new_cs(struct si_context *ctx)
 
        if (ctx->ce_preamble_ib)
                si_ce_enable_loads(ctx->ce_preamble_ib);
        else if (ctx->ce_ib)
                si_ce_enable_loads(ctx->ce_ib);
 
        if (ctx->ce_ib)
                si_ce_restore_all_descriptors_at_ib_start(ctx);
 
        if (ctx->b.chip_class >= CIK)
-               si_mark_atom_dirty(ctx, &ctx->prefetch_L2);
+               ctx->prefetch_L2 = true;
 
        /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
        ctx->framebuffer.dirty_cbufs =
                u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
        /* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
        ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
        /* This should always be marked as dirty to set the framebuffer scissor
         * at least. */
        si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 1984299..d213886 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -276,31 +276,31 @@ struct si_context {
        struct radeon_winsys_cs         *ce_ib;
        struct radeon_winsys_cs         *ce_preamble_ib;
        struct r600_resource            *ce_ram_saved_buffer;
        struct u_suballocator           *ce_suballocator;
        unsigned                        ce_ram_saved_offset;
        uint16_t                        total_ce_ram_allocated;
        bool                            ce_need_synchronization:1;
 
        bool                            gfx_flush_in_progress:1;
        bool                            compute_is_busy:1;
+       bool                            prefetch_L2:1;
 
        /* Atoms (direct states). */
        union si_state_atoms            atoms;
        unsigned                        dirty_atoms; /* mask */
        /* PM4 states (precomputed immutable states) */
        unsigned                        dirty_states;
        union si_state                  queued;
        union si_state                  emitted;
 
        /* Atom declarations. */
-       struct r600_atom                prefetch_L2;
        struct si_framebuffer           framebuffer;
        struct si_sample_locs           msaa_sample_locs;
        struct r600_atom                db_render_state;
        struct r600_atom                msaa_config;
        struct si_sample_mask           sample_mask;
        struct r600_atom                cb_render_state;
        unsigned                        last_cb_target_mask;
        struct si_blend_color           blend_color;
        struct r600_atom                clip_regs;
        struct si_clip_state            clip_state;
@@ -477,20 +477,21 @@ void si_resource_copy_region(struct pipe_context *ctx,
                           SI_CPDMA_SKIP_SYNC_BEFORE | \
                           SI_CPDMA_SKIP_GFX_SYNC | \
                           SI_CPDMA_SKIP_BO_LIST_UPDATE)
 
 void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size,
                    unsigned user_flags);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
                              uint64_t offset, unsigned size);
+void cik_emit_prefetch_L2(struct si_context *sctx);
 void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
 void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct r600_common_context *ctx,
                        struct radeon_saved_cs *saved, enum ring_type ring);
 bool si_replace_shader(unsigned num, struct ac_shader_binary *binary);
 
 /* si_dma.c */
 void si_init_dma_functions(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index acc8fb7..9fbede7 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -134,21 +134,20 @@ union si_state {
                struct si_pm4_state             *ps;
        } named;
        struct si_pm4_state     *array[0];
 };
 
 #define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))
 
 union si_state_atoms {
        struct {
                /* The order matters. */
-               struct r600_atom *prefetch_L2;
                struct r600_atom *render_cond;
                struct r600_atom *streamout_begin;
                struct r600_atom *streamout_enable; /* must be after streamout_begin */
                struct r600_atom *framebuffer;
                struct r600_atom *msaa_sample_locs;
                struct r600_atom *db_render_state;
                struct r600_atom *msaa_config;
                struct r600_atom *sample_mask;
                struct r600_atom *cb_render_state;
                struct r600_atom *blend_color;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 5254645..3f933fe 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1339,20 +1339,23 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        /* GFX9 scissor bug workaround. There is also a more efficient but
         * more involved alternative workaround. */
        if (sctx->b.chip_class == GFX9 &&
            si_is_atom_dirty(sctx, &sctx->b.scissors.atom))
                sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
 
        /* Flush caches before the first state atom, which does L2 prefetches. */
        if (sctx->b.flags)
                si_emit_cache_flush(sctx);
 
+       if (sctx->prefetch_L2)
+               cik_emit_prefetch_L2(sctx);
+
        /* Emit state atoms. */
        mask = sctx->dirty_atoms;
        while (mask) {
                struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
 
                atom->emit(&sctx->b, atom);
        }
        sctx->dirty_atoms = 0;
 
        /* Emit states. */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index daf4af5..0dd6402 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -3301,21 +3301,21 @@ bool si_update_shaders(struct si_context *sctx)
            si_pm4_state_changed(sctx, hs) ||
            si_pm4_state_changed(sctx, es) ||
            si_pm4_state_changed(sctx, gs) ||
            si_pm4_state_changed(sctx, vs) ||
            si_pm4_state_changed(sctx, ps)) {
                if (!si_update_spi_tmpring_size(sctx))
                        return false;
        }
 
        if (sctx->b.chip_class >= CIK)
-               si_mark_atom_dirty(sctx, &sctx->prefetch_L2);
+               sctx->prefetch_L2 = true;
 
        sctx->do_update_shaders = false;
        return true;
 }
 
 static void si_emit_scratch_state(struct si_context *sctx,
                                  struct r600_atom *atom)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to