[Mesa-dev] [PATCH] radeonsi: pass the scratch buffer via user SGPRs on LLVM 4.0

Marek Olšák Fri, 09 Dec 2016 09:18:37 -0800

From: Marek Olšák <marek.ol...@amd.com>

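TGSI compute shaders don't have RW_BUFFERS, so the scratch buffer pointer
is written to user SGPR[0:1] (COMPUTE_USER_DATA_0..1) instead.
Graphics shaders get the scratch buffer descriptor from the first slot of
RW_BUFFERS (SI_SCRATCH_BUFFER).

With LLVM <= 3.9, the scratch address is still patched into the shader
binary via relocations, as before.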


TODO: Dave's patch only implements the latter (the RW_BUFFERS path used by
graphics shaders); the attribute names set in si_create_function still need
to be fixed.

UNTESTED
---
 src/gallium/drivers/radeonsi/si_compute.c       |  27 +++++--
 src/gallium/drivers/radeonsi/si_shader.c        |  34 +++++---
 src/gallium/drivers/radeonsi/si_shader.h        |   1 +
 src/gallium/drivers/radeonsi/si_state.h         |   1 +
 src/gallium/drivers/radeonsi/si_state_draw.c    |   8 ++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 102 +++++++++++++-----------
 6 files changed, 111 insertions(+), 62 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c 
b/src/gallium/drivers/radeonsi/si_compute.c
index 9d83cb3..8a4c02e 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -287,21 +287,23 @@ static bool si_setup_compute_scratch_buffer(struct 
si_context *sctx,
                r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
 
                sctx->compute_scratch_buffer = (struct r600_resource*)
                        pipe_buffer_create(&sctx->screen->b.b, 0,
                                           PIPE_USAGE_DEFAULT, scratch_needed);
 
                if (!sctx->compute_scratch_buffer)
                        return false;
        }
 
-       if (sctx->compute_scratch_buffer != shader->scratch_bo && 
scratch_needed) {
+       if (HAVE_LLVM <= 0x0309 &&
+           scratch_needed &&
+           sctx->compute_scratch_buffer != shader->scratch_bo) {
                uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
 
                si_shader_apply_scratch_relocs(sctx, shader, config, 
scratch_va);
 
                if (si_shader_binary_upload(sctx->screen, shader))
                        return false;
 
                r600_resource_reference(&shader->scratch_bo,
                                        sctx->compute_scratch_buffer);
        }
@@ -351,30 +353,43 @@ static bool si_switch_compute_shader(struct si_context 
*sctx,
                /* TODO: use si_multiwave_lds_size_workaround */
                assert(lds_blocks <= 0xFF);
 
                config->rsrc2 &= C_00B84C_LDS_SIZE;
                config->rsrc2 |=  S_00B84C_LDS_SIZE(lds_blocks);
        }
 
        if (!si_setup_compute_scratch_buffer(sctx, shader, config))
                return false;
 
-       if (shader->scratch_bo) {
+       if (config->scratch_bytes_per_wave) {
                COMPUTE_DBG(sctx->screen, "Waves: %u; Scratch per wave: %u 
bytes; "
                            "Total Scratch: %u bytes\n", sctx->scratch_waves,
                            config->scratch_bytes_per_wave,
                            config->scratch_bytes_per_wave *
                            sctx->scratch_waves);
 
                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-                             shader->scratch_bo, RADEON_USAGE_READWRITE,
-                             RADEON_PRIO_SCRATCH_BUFFER);
+                                         sctx->compute_scratch_buffer,
+                                         RADEON_USAGE_READWRITE,
+                                         RADEON_PRIO_SCRATCH_BUFFER);
+
+               /* Write the scratch pointer to SGPR[0:1]. */
+               if (HAVE_LLVM >= 0x0400 &&
+                   program->ir_type == PIPE_SHADER_IR_TGSI) {
+                       uint64_t scratch_va = 
sctx->compute_scratch_buffer->gpu_address;
+
+                       radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 
2);
+                       radeon_emit(cs, scratch_va);
+                       radeon_emit(cs,
+                                   S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+                                   S_008F04_SWIZZLE_ENABLE(1));
+               }
        }
 
        shader_va = shader->bo->gpu_address + offset;
        if (program->use_code_object_v2) {
                /* Shader code is placed after the amd_kernel_code_t
                 * struct. */
                shader_va += sizeof(amd_kernel_code_t);
        }
 
        radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
@@ -729,21 +744,23 @@ static void si_launch_grid(
 
        si_upload_compute_shader_descriptors(sctx);
        si_emit_compute_shader_userdata(sctx);
 
        if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
                sctx->atoms.s.render_cond->emit(&sctx->b,
                                                sctx->atoms.s.render_cond);
                si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
        }
 
-       if (program->input_size || program->ir_type == PIPE_SHADER_IR_NATIVE)
+       if (program->ir_type == PIPE_SHADER_IR_TGSI)
+               assert(program->input_size == 0);
+       else if (program->ir_type == PIPE_SHADER_IR_NATIVE)
                si_upload_compute_input(sctx, code_object, info);
 
        /* Global buffers */
        for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
                struct r600_resource *buffer =
                                (struct 
r600_resource*)program->global_buffers[i];
                if (!buffer) {
                        continue;
                }
                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index ed8eff4..507a44d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5321,20 +5321,28 @@ static void si_create_function(struct si_shader_context 
*ctx,
                LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
                                                   "no-infs-fp-math",
                                                   "true");
                LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
                                                   "no-nans-fp-math",
                                                   "true");
                LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
                                                   "unsafe-fp-math",
                                                   "true");
        }
+
+       if (ctx->type == PIPE_SHADER_COMPUTE) {
+               LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
+                                                  "amdgpu-spill-bufsgpr01", 
"true");
+       } else {
+               LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
+                                                  
"amdgpu-spill-bufsgpr01-load", "true");
+       }
 }
 
 static void create_meta_data(struct si_shader_context *ctx)
 {
        struct gallivm_state *gallivm = ctx->soa.bld_base.base.gallivm;
 
        ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
                                                               
"invariant.load", 14);
        ctx->range_md_kind = LLVMGetMDKindIDInContext(gallivm->context,
                                                     "range", 5);
@@ -5762,32 +5770,36 @@ static void si_llvm_emit_polygon_stipple(struct 
si_shader_context *ctx,
        lp_build_intrinsic(builder, "llvm.AMDGPU.kill", ctx->voidt, &bit, 1, 0);
 }
 
 void si_shader_binary_read_config(struct radeon_shader_binary *binary,
                                  struct si_shader_config *conf,
                                  unsigned symbol_offset)
 {
        unsigned i;
        const unsigned char *config =
                radeon_shader_binary_config_start(binary, symbol_offset);
-       bool really_needs_scratch = false;
+       bool may_need_scratch = true;
 
-       /* LLVM adds SGPR spills to the scratch size.
-        * Find out if we really need the scratch buffer.
-        */
-       for (i = 0; i < binary->reloc_count; i++) {
-               const struct radeon_shader_reloc *reloc = &binary->relocs[i];
+       if (HAVE_LLVM <= 0x0309) {
+               /* LLVM adds SGPR spills to the scratch size.
+                * Find out if we really need the scratch buffer.
+                */
+               may_need_scratch = false;
 
-               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
-                   !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-                       really_needs_scratch = true;
-                       break;
+               for (i = 0; i < binary->reloc_count; i++) {
+                       const struct radeon_shader_reloc *reloc = 
&binary->relocs[i];
+
+                       if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
+                           !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+                               may_need_scratch = true;
+                               break;
+                       }
                }
        }
 
        /* XXX: We may be able to emit some of these values directly rather than
         * extracting fields to be emitted later.
         */
 
        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
                unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i));
                unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i + 4));
@@ -5810,21 +5822,21 @@ void si_shader_binary_read_config(struct 
radeon_shader_binary *binary,
                        break;
                case R_0286CC_SPI_PS_INPUT_ENA:
                        conf->spi_ps_input_ena = value;
                        break;
                case R_0286D0_SPI_PS_INPUT_ADDR:
                        conf->spi_ps_input_addr = value;
                        break;
                case R_0286E8_SPI_TMPRING_SIZE:
                case R_00B860_COMPUTE_TMPRING_SIZE:
                        /* WAVESIZE is in units of 256 dwords. */
-                       if (really_needs_scratch)
+                       if (may_need_scratch)
                                conf->scratch_bytes_per_wave =
                                        G_00B860_WAVESIZE(value) * 256 * 4;
                        break;
                case 0x4: /* SPILLED_SGPRS */
                        conf->spilled_sgprs = value;
                        break;
                case 0x8: /* SPILLED_VGPRS */
                        conf->spilled_vgprs = value;
                        break;
                default:
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 129e571..b30f61b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -483,20 +483,21 @@ struct si_shader_info {
 
 struct si_shader {
        struct si_shader_selector       *selector;
        struct si_shader                *next_variant;
 
        struct si_shader_part           *prolog;
        struct si_shader_part           *epilog;
 
        struct si_pm4_state             *pm4;
        struct r600_resource            *bo;
+       /* for tracking which scratch address the binary contains (<= LLVM 3.9) 
*/
        struct r600_resource            *scratch_bo;
        struct si_shader_key            key;
        struct util_queue_fence         optimized_ready;
        bool                            compilation_failed;
        bool                            is_monolithic;
        bool                            is_optimized;
        bool                            is_binary_shared;
        bool                            is_gs_copy_shader;
 
        /* The following data is all that's needed for binary shaders. */
diff --git a/src/gallium/drivers/radeonsi/si_state.h 
b/src/gallium/drivers/radeonsi/si_state.h
index d8e6024..b6b089a 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -154,20 +154,21 @@ union si_state_atoms {
 
 #define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*))
 
 struct si_shader_data {
        struct r600_atom        atom;
        uint32_t                sh_base[SI_NUM_SHADERS];
 };
 
 /* Private read-write buffer slots. */
 enum {
+       SI_SCRATCH_BUFFER,
        SI_HS_RING_TESS_FACTOR,
        SI_HS_RING_TESS_OFFCHIP,
 
        SI_ES_RING_ESGS,
        SI_GS_RING_ESGS,
 
        SI_GS_RING_GSVS0,
        SI_GS_RING_GSVS1,
        SI_GS_RING_GSVS2,
        SI_GS_RING_GSVS3,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c 
b/src/gallium/drivers/radeonsi/si_state_draw.c
index cae19dc..e447e32 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1016,20 +1016,28 @@ void si_draw_vbo(struct pipe_context *ctx, const struct 
pipe_draw_info *info)
 
                if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
                        sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
                        sctx->do_update_shaders = true;
                }
        }
 
        if (sctx->do_update_shaders && !si_update_shaders(sctx))
                return;
 
+       /* Do it after si_update_shaders, but before
+        * si_upload_graphics_shader_descriptors. */
+       if (HAVE_LLVM >= 0x0400 && sctx->emit_scratch_reloc) {
+               si_set_ring_buffer(ctx, SI_SCRATCH_BUFFER,
+                                  &sctx->scratch_buffer->b.b,
+                                  0, 0xffffffff, true, true, 4, 64, 0);
+       }
+
        if (!si_upload_graphics_shader_descriptors(sctx))
                return;
 
        if (info->indexed) {
                /* Initialize the index buffer struct. */
                pipe_resource_reference(&ib.buffer, sctx->index_buffer.buffer);
                ib.user_buffer = sctx->index_buffer.user_buffer;
                ib.index_size = sctx->index_buffer.index_size;
                ib.offset = sctx->index_buffer.offset;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 0afc3b4..bb9f3a8 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2130,90 +2130,100 @@ static unsigned 
si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
        unsigned bytes = 0;
 
        bytes = MAX2(bytes, 
si_get_scratch_buffer_bytes_per_wave(sctx->ps_shader.current));
        bytes = MAX2(bytes, 
si_get_scratch_buffer_bytes_per_wave(sctx->gs_shader.current));
        bytes = MAX2(bytes, 
si_get_scratch_buffer_bytes_per_wave(sctx->vs_shader.current));
        bytes = MAX2(bytes, 
si_get_scratch_buffer_bytes_per_wave(sctx->tcs_shader.current));
        bytes = MAX2(bytes, 
si_get_scratch_buffer_bytes_per_wave(sctx->tes_shader.current));
        return bytes;
 }
 
+static bool si_update_scratch_relocs(struct si_context *sctx)
+{
+       int r;
+
+       /* Update the shaders, so they are using the latest scratch.  The
+        * scratch buffer may have been changed since these shaders were
+        * last used, so we still need to try to update them, even if
+        * they require scratch buffers smaller than the current size.
+        */
+       r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1)
+               si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+       r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1)
+               si_pm4_bind_state(sctx, gs, sctx->gs_shader.current->pm4);
+
+       r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1)
+               si_pm4_bind_state(sctx, hs, sctx->tcs_shader.current->pm4);
+
+       /* VS can be bound as LS, ES, or VS. */
+       r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1) {
+               if (sctx->tes_shader.current)
+                       si_pm4_bind_state(sctx, ls, 
sctx->vs_shader.current->pm4);
+               else if (sctx->gs_shader.current)
+                       si_pm4_bind_state(sctx, es, 
sctx->vs_shader.current->pm4);
+               else
+                       si_pm4_bind_state(sctx, vs, 
sctx->vs_shader.current->pm4);
+       }
+
+       /* TES can be bound as ES or VS. */
+       r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
+       if (r < 0)
+               return false;
+       if (r == 1) {
+               if (sctx->gs_shader.current)
+                       si_pm4_bind_state(sctx, es, 
sctx->tes_shader.current->pm4);
+               else
+                       si_pm4_bind_state(sctx, vs, 
sctx->tes_shader.current->pm4);
+       }
+
+       return true;
+}
+
 static bool si_update_spi_tmpring_size(struct si_context *sctx)
 {
        unsigned current_scratch_buffer_size =
                si_get_current_scratch_buffer_size(sctx);
        unsigned scratch_bytes_per_wave =
                si_get_max_scratch_bytes_per_wave(sctx);
        unsigned scratch_needed_size = scratch_bytes_per_wave *
                sctx->scratch_waves;
        unsigned spi_tmpring_size;
-       int r;
 
        if (scratch_needed_size > 0) {
                if (scratch_needed_size > current_scratch_buffer_size) {
                        /* Create a bigger scratch buffer */
                        r600_resource_reference(&sctx->scratch_buffer, NULL);
 
                        sctx->scratch_buffer = (struct r600_resource*)
                                        pipe_buffer_create(&sctx->screen->b.b, 
0,
                                        PIPE_USAGE_DEFAULT, 
scratch_needed_size);
                        if (!sctx->scratch_buffer)
                                return false;
                        sctx->emit_scratch_reloc = true;
                }
 
-               /* Update the shaders, so they are using the latest scratch.  
The
-                * scratch buffer may have been changed since these shaders were
-                * last used, so we still need to try to update them, even if
-                * they require scratch buffers smaller than the current size.
-                */
-               r = si_update_scratch_buffer(sctx, sctx->ps_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1)
-                       si_pm4_bind_state(sctx, ps, 
sctx->ps_shader.current->pm4);
-
-               r = si_update_scratch_buffer(sctx, sctx->gs_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1)
-                       si_pm4_bind_state(sctx, gs, 
sctx->gs_shader.current->pm4);
-
-               r = si_update_scratch_buffer(sctx, sctx->tcs_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1)
-                       si_pm4_bind_state(sctx, hs, 
sctx->tcs_shader.current->pm4);
-
-               /* VS can be bound as LS, ES, or VS. */
-               r = si_update_scratch_buffer(sctx, sctx->vs_shader.current);
-               if (r < 0)
-                       return false;
-               if (r == 1) {
-                       if (sctx->tes_shader.current)
-                               si_pm4_bind_state(sctx, ls, 
sctx->vs_shader.current->pm4);
-                       else if (sctx->gs_shader.current)
-                               si_pm4_bind_state(sctx, es, 
sctx->vs_shader.current->pm4);
-                       else
-                               si_pm4_bind_state(sctx, vs, 
sctx->vs_shader.current->pm4);
-               }
-
-               /* TES can be bound as ES or VS. */
-               r = si_update_scratch_buffer(sctx, sctx->tes_shader.current);
-               if (r < 0)
+               if (HAVE_LLVM <= 0x0309 &&
+                   !si_update_scratch_relocs(sctx))
                        return false;
-               if (r == 1) {
-                       if (sctx->gs_shader.current)
-                               si_pm4_bind_state(sctx, es, 
sctx->tes_shader.current->pm4);
-                       else
-                               si_pm4_bind_state(sctx, vs, 
sctx->tes_shader.current->pm4);
-               }
        }
 
        /* The LLVM shader backend should be reporting aligned scratch_sizes. */
        assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
                "scratch size should already be aligned correctly.");
 
        spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
                           S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
        if (spi_tmpring_size != sctx->spi_tmpring_size) {
                sctx->spi_tmpring_size = spi_tmpring_size;
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH] radeonsi: pass the scratch buffer via user SGPRs on LLVM 4.0

Reply via email to