On 17.05.2017 21:38, Marek Olšák wrote:
From: Marek Olšák <[email protected]>

This decreases the size of CE RAM dumps to L2, or the size of descriptor
uploads without CE.
---
 src/gallium/drivers/radeonsi/si_compute.c       | 28 ++++++--
 src/gallium/drivers/radeonsi/si_descriptors.c   | 85 ++++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_state.h         | 18 +++++-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  6 ++
 4 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef111..4c98066 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -201,21 +201,38 @@ static void *si_create_compute_state(
                        return NULL;
                }
        }

        return program;
 }

 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context*)ctx;
-       sctx->cs_shader_state.program = (struct si_compute*)state;
+       struct si_compute *program = (struct si_compute*)state;
+
+       sctx->cs_shader_state.program = program;
+       if (!program)
+               return;
+
+       /* Wait because we need active slot usage masks. */
+       if (program->ir_type == PIPE_SHADER_IR_TGSI)
+               util_queue_fence_wait(&program->ready);
+
+       si_set_active_descriptors(sctx,
+                                 SI_DESCS_FIRST_COMPUTE +
+                                 SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+                                 program->active_const_and_shader_buffers);
+       si_set_active_descriptors(sctx,
+                                 SI_DESCS_FIRST_COMPUTE +
+                                 SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+                                 program->active_samplers_and_images);
 }

 static void si_set_global_binding(
        struct pipe_context *ctx, unsigned first, unsigned n,
        struct pipe_resource **resources,
        uint32_t **handles)
 {
        unsigned i;
        struct si_context *sctx = (struct si_context*)ctx;
        struct si_compute *program = sctx->cs_shader_state.program;
@@ -749,26 +766,23 @@ static void si_launch_grid(
        bool cs_regalloc_hang =
                (sctx->b.chip_class == SI ||
                 sctx->b.family == CHIP_BONAIRE ||
                 sctx->b.family == CHIP_KABINI) &&
                info->block[0] * info->block[1] * info->block[2] > 256;

        if (cs_regalloc_hang)
                sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                                 SI_CONTEXT_CS_PARTIAL_FLUSH;

-       if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-               util_queue_fence_wait(&program->ready);
-
-               if (program->shader.compilation_failed)
-                       return;
-       }
+       if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+           program->shader.compilation_failed)
+               return;

        si_decompress_compute_textures(sctx);

        /* Add buffer sizes for memory checking in need_cs_space. */
        r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
        /* TODO: add the scratch buffer */

        if (info->indirect) {
                r600_context_add_resource_size(ctx, info->indirect);

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c 
b/src/gallium/drivers/radeonsi/si_descriptors.c
index 38e4ae1..a2f40a8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -118,26 +118,28 @@ static void si_init_descriptors(struct si_descriptors 
*desc,
        }
 }

 static void si_release_descriptors(struct si_descriptors *desc)
 {
        r600_resource_reference(&desc->buffer, NULL);
        FREE(desc->list);
 }

 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned 
size,
-                        unsigned *out_offset, struct r600_resource **out_buf) {
+                        unsigned *out_offset, struct r600_resource **out_buf)
+{
        uint64_t va;

        u_suballocator_alloc(sctx->ce_suballocator, size,
-                            sctx->screen->b.info.tcc_cache_line_size,
-                            out_offset, (struct pipe_resource**)out_buf);
+                            si_optimal_tcc_alignment(sctx, size),
+                            (unsigned*)out_offset,

The extra (unsigned*) cast of out_offset is unnecessary — the parameter is already declared as unsigned *out_offset, so it can be passed directly.


+                            (struct pipe_resource**)out_buf);
        if (!out_buf)
                        return false;

        va = (*out_buf)->gpu_address + *out_offset;

        radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0));
        radeon_emit(sctx->ce_ib, ce_offset);
        radeon_emit(sctx->ce_ib, size / 4);
        radeon_emit(sctx->ce_ib, va);
        radeon_emit(sctx->ce_ib, va >> 32);
@@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib)
        radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
        radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) |
                        CONTEXT_CONTROL_LOAD_CE_RAM(1));
        radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1));
 }

 static bool si_upload_descriptors(struct si_context *sctx,
                                  struct si_descriptors *desc,
                                  struct r600_atom * atom)
 {
-       unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+       unsigned slot_size = desc->element_dw_size * 4;
+       unsigned first_slot_offset = desc->first_active_slot * slot_size;
+       unsigned upload_size = desc->num_active_slots * slot_size;
+
+       if (!upload_size)
+               return true;

The early-out here means that desc->num_active_slots *does* control what is written to CE RAM, contrary to what its descriptive comment in si_state.h says ("With CE: ... It doesn't skip uploads to CE RAM."). The check needs to be moved further down, after the PKT3_WRITE_CONST_RAM emission, so CE RAM is still updated when no slots are active.

Cheers,
Nicolai



        if (sctx->ce_ib && desc->uses_ce) {
                uint32_t const* list = (uint32_t const*)desc->list;

                while(desc->dirty_mask) {
                        int begin, count;
                        u_bit_scan_consecutive_range64(&desc->dirty_mask, 
&begin,
                                                       &count);

                        begin *= desc->element_dw_size;
                        count *= desc->element_dw_size;

                        radeon_emit(sctx->ce_ib,
                                    PKT3(PKT3_WRITE_CONST_RAM, count, 0));
                        radeon_emit(sctx->ce_ib, desc->ce_offset + begin * 4);
                        radeon_emit_array(sctx->ce_ib, list + begin, count);
                }

-               if (!si_ce_upload(sctx, desc->ce_offset, list_size,
-                                          &desc->buffer_offset, &desc->buffer))
+               if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+                                 upload_size, (unsigned*)&desc->buffer_offset,
+                                 &desc->buffer))
                        return false;
        } else {
-               void *ptr;
+               uint32_t *ptr;

-               u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
-                              sctx->screen->b.info.tcc_cache_line_size,
-                              &desc->buffer_offset,
-                              (struct pipe_resource**)&desc->buffer, &ptr);
+               u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
+                              si_optimal_tcc_alignment(sctx, upload_size),
+                              (unsigned*)&desc->buffer_offset,
+                              (struct pipe_resource**)&desc->buffer,
+                              (void**)&ptr);
                if (!desc->buffer)
                        return false; /* skip the draw call */

-               util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
-               desc->gpu_list = ptr;
+               util_memcpy_cpu_to_le32(ptr, (char*)desc->list + 
first_slot_offset,
+                                       upload_size);
+               desc->gpu_list = ptr - first_slot_offset / 4;

                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
                                    RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
        }
+
+       /* The shader pointer should point to slot 0. */
+       desc->buffer_offset -= first_slot_offset;
+
        desc->dirty_mask = 0;

        if (atom)
                si_mark_atom_dirty(sctx, atom);

        return true;
 }

 static void
 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors 
*desc)
@@ -1023,21 +1037,21 @@ bool si_upload_vertex_buffer_descriptors(struct 
si_context *sctx)
        desc_list_byte_size = velems->desc_list_byte_size;
        first_vb_use_mask = velems->first_vb_use_mask;

        /* Vertex buffer descriptors are the only ones which are uploaded
         * directly through a staging buffer and don't go through
         * the fine-grained upload path.
         */
        u_upload_alloc(sctx->b.b.const_uploader, 0,
                       desc_list_byte_size,
                       si_optimal_tcc_alignment(sctx, desc_list_byte_size),
-                      &desc->buffer_offset,
+                      (unsigned*)&desc->buffer_offset,
                       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
        if (!desc->buffer)
                return false;

        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                              desc->buffer, RADEON_USAGE_READ,
                              RADEON_PRIO_DESCRIPTORS);

        assert(count <= SI_MAX_ATTRIBS);

@@ -1883,21 +1897,22 @@ void si_shader_change_notify(struct si_context *sctx)
        }
 }

 static void si_emit_shader_pointer(struct si_context *sctx,
                                   struct si_descriptors *desc,
                                   unsigned sh_base)
 {
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        uint64_t va;

-       assert(desc->buffer);
+       if (!desc->buffer)
+               return; /* the pointer is not used by current shaders */

        va = desc->buffer->gpu_address +
             desc->buffer_offset;

        radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
        radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) 
>> 2);
        radeon_emit(cs, va);
        radeon_emit(cs, va >> 32);
 }

@@ -2026,20 +2041,22 @@ void si_init_all_descriptors(struct si_context *sctx)
        }

        si_init_buffer_resources(&sctx->rw_buffers,
                                 &sctx->descriptors[SI_DESCS_RW_BUFFERS],
                                 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
                                 /* The second set of usage/priority is used by
                                  * const buffers in RW buffer slots. */
                                 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
                                 RADEON_PRIO_SHADER_RINGS, 
RADEON_PRIO_CONST_BUFFER,
                                 &ce_offset);
+       sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = 
SI_NUM_RW_BUFFERS;
+
        si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
                            4, SI_NUM_VERTEX_BUFFERS, NULL);

        sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
        sctx->total_ce_ram_allocated = ce_offset;

        if (sctx->b.chip_class >= GFX9)
                assert(ce_offset <= 4096);
        else
                assert(ce_offset <= 32768);
@@ -2148,10 +2165,48 @@ void si_all_descriptors_begin_new_cs(struct si_context 
*sctx)
                si_image_views_begin_new_cs(sctx, &sctx->images[i]);
        }
        si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers);
        si_vertex_buffers_begin_new_cs(sctx);

        for (i = 0; i < SI_NUM_DESCS; ++i)
                si_descriptors_begin_new_cs(sctx, &sctx->descriptors[i]);

        si_shader_userdata_begin_new_cs(sctx);
 }
+
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+                              uint64_t new_active_mask)
+{
+       struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+
+       /* Ignore no-op updates and updates that disable all slots. */
+       if (!new_active_mask ||
+           new_active_mask == u_bit_consecutive64(desc->first_active_slot,
+                                                  desc->num_active_slots))
+               return;
+
+       int first, count;
+       u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+       assert(new_active_mask == 0);
+
+       /* Upload/dump descriptors if slots are being enabled. */
+       if (first < desc->first_active_slot ||
+           first + count > desc->first_active_slot + desc->num_active_slots)
+               sctx->descriptors_dirty |= 1u << desc_idx;
+
+       desc->first_active_slot = first;
+       desc->num_active_slots = count;
+}
+
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+                                         struct si_shader_selector *sel)
+{
+       if (!sel)
+               return;
+
+       si_set_active_descriptors(sctx,
+               si_const_and_shader_buffer_descriptors_idx(sel->type),
+               sel->active_const_and_shader_buffers);
+       si_set_active_descriptors(sctx,
+               si_sampler_and_image_descriptors_idx(sel->type),
+               sel->active_samplers_and_images);
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index f2003a5..dfabaa3 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -35,20 +35,21 @@

 #define SI_MAX_ATTRIBS                 16
 #define SI_NUM_VERTEX_BUFFERS          SI_MAX_ATTRIBS
 #define SI_NUM_SAMPLERS                        32 /* OpenGL textures units per 
shader */
 #define SI_NUM_CONST_BUFFERS           16
 #define SI_NUM_IMAGES                  16
 #define SI_NUM_SHADER_BUFFERS          16

 struct si_screen;
 struct si_shader;
+struct si_shader_selector;

 struct si_state_blend {
        struct si_pm4_state     pm4;
        uint32_t                cb_target_mask;
        bool                    alpha_to_coverage;
        bool                    alpha_to_one;
        bool                    dual_src_blend;
        /* Set 0xf or 0x0 (4 bits) per render target if the following is
         * true. ANDed with spi_shader_col_format.
         */
@@ -215,26 +216,34 @@ struct si_descriptors {
        uint32_t *list;
        /* The list in mapped GPU memory. */
        uint32_t *gpu_list;
        /* The size of one descriptor. */
        unsigned element_dw_size;
        /* The maximum number of descriptors. */
        unsigned num_elements;

        /* The buffer where the descriptors have been uploaded. */
        struct r600_resource *buffer;
-       unsigned buffer_offset;
+       int buffer_offset; /* can be negative if not using lower slots */

        /* Offset in CE RAM */
        unsigned ce_offset;

-       /* elements of the list that are changed and need to be uploaded */
+       /* Slots that are used by currently-bound shaders.
+        * With CE: It determines which slots are dumped to L2.
+        *          It doesn't skip uploads to CE RAM.
+        * Without CE: It determines which slots are uploaded.
+        */
+       unsigned first_active_slot;
+       unsigned num_active_slots;
+
+       /* Slots that have been changed and need to be uploaded. */
        uint64_t dirty_mask;

        /* Whether CE is used to upload this descriptor array. */
        bool uses_ce;

        /* The shader userdata offset within a shader where the 64-bit pointer 
to the descriptor
         * array will be stored. */
        unsigned shader_userdata_offset;
 };

@@ -308,20 +317,25 @@ void si_all_descriptors_begin_new_cs(struct si_context 
*sctx);
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource 
**rbuffer,
                            const uint8_t *ptr, unsigned size, uint32_t 
*const_offset);
 void si_update_all_texture_descriptors(struct si_context *sctx);
 void si_shader_change_notify(struct si_context *sctx);
 void si_update_compressed_colortex_masks(struct si_context *sctx);
 void si_emit_graphics_shader_userdata(struct si_context *sctx,
                                       struct r600_atom *atom);
 void si_emit_compute_shader_userdata(struct si_context *sctx);
 void si_set_rw_buffer(struct si_context *sctx,
                      uint slot, const struct pipe_constant_buffer *input);
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+                              uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+                                         struct si_shader_selector *sel);
+
 /* si_state.c */
 struct si_shader_selector;

 void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
                  struct r600_atom **list_elem,
                  void (*emit_func)(struct si_context *ctx, struct r600_atom 
*state));
 void si_init_state_functions(struct si_context *sctx);
 void si_init_screen_state_functions(struct si_screen *sscreen);
 void
 si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 45d996b..8ac4309 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2144,20 +2144,21 @@ static void si_bind_vs_shader(struct pipe_context *ctx, 
void *state)
        struct si_shader_selector *sel = state;

        if (sctx->vs_shader.cso == sel)
                return;

        sctx->vs_shader.cso = sel;
        sctx->vs_shader.current = sel ? sel->first_variant : NULL;
        sctx->do_update_shaders = true;
        si_mark_atom_dirty(sctx, &sctx->clip_regs);
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+       si_set_active_descriptors_for_shader(sctx, sel);
 }

 static void si_update_tess_uses_prim_id(struct si_context *sctx)
 {
        sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id =
                (sctx->tes_shader.cso &&
                 sctx->tes_shader.cso->info.uses_primid) ||
                (sctx->tcs_shader.cso &&
                 sctx->tcs_shader.cso->info.uses_primid) ||
                (sctx->gs_shader.cso &&
@@ -2181,38 +2182,41 @@ static void si_bind_gs_shader(struct pipe_context *ctx, 
void *state)
        sctx->do_update_shaders = true;
        si_mark_atom_dirty(sctx, &sctx->clip_regs);
        sctx->last_rast_prim = -1; /* reset this so that it gets updated */

        if (enable_changed) {
                si_shader_change_notify(sctx);
                if (sctx->ia_multi_vgt_param_key.u.uses_tess)
                        si_update_tess_uses_prim_id(sctx);
        }
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+       si_set_active_descriptors_for_shader(sctx, sel);
 }

 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_shader_selector *sel = state;
        bool enable_changed = !!sctx->tcs_shader.cso != !!sel;

        if (sctx->tcs_shader.cso == sel)
                return;

        sctx->tcs_shader.cso = sel;
        sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
        si_update_tess_uses_prim_id(sctx);
        sctx->do_update_shaders = true;

        if (enable_changed)
                sctx->last_tcs = NULL; /* invalidate derived tess state */
+
+       si_set_active_descriptors_for_shader(sctx, sel);
 }

 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_shader_selector *sel = state;
        bool enable_changed = !!sctx->tes_shader.cso != !!sel;

        if (sctx->tes_shader.cso == sel)
                return;
@@ -2223,37 +2227,39 @@ static void si_bind_tes_shader(struct pipe_context 
*ctx, void *state)
        si_update_tess_uses_prim_id(sctx);
        sctx->do_update_shaders = true;
        si_mark_atom_dirty(sctx, &sctx->clip_regs);
        sctx->last_rast_prim = -1; /* reset this so that it gets updated */

        if (enable_changed) {
                si_shader_change_notify(sctx);
                sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
        }
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+       si_set_active_descriptors_for_shader(sctx, sel);
 }

 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_shader_selector *sel = state;

        /* skip if supplied shader is one already in use */
        if (sctx->ps_shader.cso == sel)
                return;

        sctx->ps_shader.cso = sel;
        sctx->ps_shader.current = sel ? sel->first_variant : NULL;
        sctx->do_update_shaders = true;
        if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
                si_update_tess_uses_prim_id(sctx);
        si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+       si_set_active_descriptors_for_shader(sctx, sel);
 }

 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
        if (shader->is_optimized) {
                util_queue_fence_wait(&shader->optimized_ready);
                util_queue_fence_destroy(&shader->optimized_ready);
        }

        if (shader->pm4) {



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to