Re: [Mesa-dev] [PATCH 12/25] radeonsi: add VS prolog

Nicolai Hähnle Tue, 16 Feb 2016 08:06:03 -0800

On 15.02.2016 18:59, Marek Olšák wrote:

From: Marek Olšák <marek.ol...@amd.com>


This is disabled with use_monolithic_shaders = true.
---
  src/gallium/drivers/radeonsi/si_pipe.c   |  19 +++
  src/gallium/drivers/radeonsi/si_pipe.h   |   3 +
  src/gallium/drivers/radeonsi/si_shader.c | 236 ++++++++++++++++++++++++++++++-
  src/gallium/drivers/radeonsi/si_shader.h |   9 ++
  4 files changed, 266 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c 
b/src/gallium/drivers/radeonsi/si_pipe.c
index 448fe88..7ce9570 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -22,6 +22,7 @@
   */

  #include "si_pipe.h"
+#include "si_shader.h"
  #include "si_public.h"
  #include "sid.h"

@@ -536,6 +537,11 @@ static int si_get_shader_param(struct pipe_screen* 
pscreen, unsigned shader, enu
  static void si_destroy_screen(struct pipe_screen* pscreen)
  {
        struct si_screen *sscreen = (struct si_screen *)pscreen;
+       struct si_shader_part *parts[] = {
+               sscreen->vs_prologs,
+               /* this will be filled with other shader parts */
+       };
+       unsigned i;

        if (!sscreen)
                return;
@@ -543,6 +549,18 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
        if (!sscreen->b.ws->unref(sscreen->b.ws))
                return;

+       /* Free shader parts. */
+       for (i = 0; i < ARRAY_SIZE(parts); i++) {
+               while (parts[i]) {
+                       struct si_shader_part *part = parts[i];
+
+                       parts[i] = part->next;
+                       radeon_shader_binary_clean(&part->binary);
+                       FREE(part);
+               }
+       }
+       pipe_mutex_destroy(sscreen->shader_parts_mutex);
+
        r600_destroy_common_screen(&sscreen->b);
  }

@@ -600,6 +618,7 @@ struct pipe_screen *radeonsi_screen_create(struct 
radeon_winsys *ws)

        sscreen->b.has_cp_dma = true;
        sscreen->b.has_streamout = true;
+       pipe_mutex_init(sscreen->shader_parts_mutex);
        sscreen->use_monolithic_shaders = true;

        if (debug_get_bool_option("RADEON_DUMP_SHADERS", FALSE))
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
b/src/gallium/drivers/radeonsi/si_pipe.h
index 2a2455c..f4bafc2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -87,6 +87,9 @@ struct si_screen {

        /* Whether shaders are monolithic (1-part) or separate (3-part). */
        bool                            use_monolithic_shaders;
+
+       pipe_mutex                      shader_parts_mutex;
+       struct si_shader_part           *vs_prologs;
  };

  struct si_blend_color {
diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index b74ed1e..fbb8394 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -83,6 +83,7 @@ struct si_shader_context
        int param_rel_auto_id;
        int param_vs_prim_id;
        int param_instance_id;
+       int param_vertex_index0;
        int param_tes_u;
        int param_tes_v;
        int param_tes_rel_patch_id;
@@ -432,7 +433,11 @@ static void declare_input_vs(
        /* Build the attribute offset */
        attribute_offset = lp_build_const_int32(gallivm, 0);

-       if (divisor) {
+       if (!ctx->is_monolithic) {
+               buffer_index = LLVMGetParam(radeon_bld->main_fn,
+                                           ctx->param_vertex_index0 +
+                                           input_index);
+       } else if (divisor) {
                /* Build index from instance ID, start instance and divisor */
                ctx->shader->uses_instanceid = true;
                buffer_index = get_instance_index_for_fetch(&ctx->radeon_bld,
@@ -3711,6 +3716,15 @@ static void create_function(struct si_shader_context 
*ctx)
                params[ctx->param_rel_auto_id = num_params++] = ctx->i32;
                params[ctx->param_vs_prim_id = num_params++] = ctx->i32;
                params[ctx->param_instance_id = num_params++] = ctx->i32;
+
+               if (!ctx->is_monolithic &&
+                   !ctx->is_gs_copy_shader) {
+                       /* Vertex load indices. */
+                       ctx->param_vertex_index0 = num_params;
+
+                       for (i = 0; i < shader->selector->info.num_inputs; i++)
+                               params[num_params++] = ctx->i32;
+               }
                break;

        case TGSI_PROCESSOR_TESS_CTRL:
@@ -4678,6 +4692,203 @@ out:
        return r;
  }

+/**
+ * Create, compile and return a shader part (prolog or epilog).
+ *
+ * \param sscreen      screen
+ * \param list         list of shader parts of the same category
+ * \param key          shader part key
+ * \param tm           LLVM target machine
+ * \param debug                debug callback
+ * \param compile      the callback responsible for compilation
+ * \return             non-NULL on success
+ */
+static struct si_shader_part *
+si_get_shader_part(struct si_screen *sscreen,
+                  struct si_shader_part **list,
+                  union si_shader_part_key *key,
+                  LLVMTargetMachineRef tm,
+                  struct pipe_debug_callback *debug,
+                  bool (*compile)(struct si_screen *,
+                                  LLVMTargetMachineRef,
+                                  struct pipe_debug_callback *,
+                                  struct si_shader_part *))
+{
+       struct si_shader_part *result;
+
+       pipe_mutex_lock(sscreen->shader_parts_mutex);
+
+       /* Find existing. */
+       for (result = *list; result; result = result->next)
+               if (memcmp(&result->key, key, sizeof(*key)) == 0) {
+                       pipe_mutex_unlock(sscreen->shader_parts_mutex);
+                       return result;
+               }

Bike-shedding: I generally prefer braces around multi-line blocks evenwhen they consist only of a single statement.

+
+       /* Compile a new one. */
+       result = CALLOC_STRUCT(si_shader_part);
+       result->key = *key;
+       if (!compile(sscreen, tm, debug, result)) {
+               FREE(result);
+               pipe_mutex_unlock(sscreen->shader_parts_mutex);
+               return NULL;
+       }
+
+       result->next = *list;
+       *list = result;
+       pipe_mutex_unlock(sscreen->shader_parts_mutex);
+       return result;
+}
+
+/**
+ * Create a vertex shader prolog.
+ *
+ * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
+ * All inputs are returned unmodified. The vertex load indices are
+ * stored after them, which will used by the API VS for fetching inputs.
+ *
+ * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
+ *   input_v0,
+ *   input_v1,
+ *   input_v2,
+ *   input_v3,
+ *   (VertexID + BaseVertex),
+ *   (InstanceID + StartInstance),
+ *   (InstanceID / 2 + StartInstance)
+ */
+static bool si_compile_vs_prolog(struct si_screen *sscreen,
+                                LLVMTargetMachineRef tm,
+                                struct pipe_debug_callback *debug,
+                                struct si_shader_part *out)
+{
+       union si_shader_part_key *key = &out->key;
+       struct si_shader shader = {};
+       struct si_shader_context ctx;
+       struct gallivm_state *gallivm = &ctx.radeon_bld.gallivm;
+       LLVMTypeRef *params, *returns;
+       LLVMValueRef ret, func;
+       int last_sgpr, num_params, num_returns, i;
+       bool status = true;
+
+       si_init_shader_ctx(&ctx, sscreen, &shader, tm, NULL);
+       ctx.type = TGSI_PROCESSOR_VERTEX;
+       ctx.param_vertex_id = key->vs_prolog.num_input_sgprs;
+       ctx.param_instance_id = key->vs_prolog.num_input_sgprs + 3;
+
+       /* 4 preloaded VGPRs + vertex load indices as prolog outputs */
+       params = alloca((key->vs_prolog.num_input_sgprs + 4) *
+                       sizeof(LLVMTypeRef));
+       returns = alloca((key->vs_prolog.num_input_sgprs + 4 +
+                         key->vs_prolog.last_input + 1) *
+                        sizeof(LLVMTypeRef));
+       num_params = 0;
+       num_returns = 0;
+
+       /* Declare input and output SGPRs. */
+       num_params = 0;
+       for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+               params[num_params++] = ctx.i32;
+               returns[num_returns++] = ctx.i32;
+       }
+       last_sgpr = num_params - 1;
+
+       /* 4 preloaded VGPRs (outputs must be floats) */
+       for (i = 0; i < 4; i++) {
+               params[num_params++] = ctx.i32;
+               returns[num_returns++] = ctx.f32;
+       }
+
+       /* Vertex load indices. */
+       for (i = 0; i <= key->vs_prolog.last_input; i++)
+               returns[num_returns++] = ctx.f32;
+
+       /* Create the function. */
+       si_create_function(&ctx, returns, num_returns, params,
+                          num_params, -1, last_sgpr);
+       func = ctx.radeon_bld.main_fn;
+
+       /* Copy inputs to outputs. This should be no-op, as the registers match,
+        * but it will prevent the compiler from overwriting them 
unintentionally.
+        */
+       ret = ctx.return_value;
+       for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
+               LLVMValueRef p = LLVMGetParam(func, i);
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+       }
+       for (i = num_params - 4; i < num_params; i++) {
+               LLVMValueRef p = LLVMGetParam(func, i);
+               p = LLVMBuildBitCast(gallivm->builder, p, ctx.f32, "");
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
+       }
+
+       /* Compute vertex load indices from instance divisors. */
+       for (i = 0; i <= key->vs_prolog.last_input; i++) {
+               unsigned divisor = key->vs_prolog.states.instance_divisors[i];
+               LLVMValueRef index;
+
+               if (divisor) {
+                       /* InstanceID / Divisor + StartInstance */
+                       index = get_instance_index_for_fetch(&ctx.radeon_bld,
+                                                            
SI_SGPR_START_INSTANCE,
+                                                            divisor);
+               } else {
+                       /* VertexID + BaseVertex */
+                       index = LLVMBuildAdd(gallivm->builder,
+                                            LLVMGetParam(func, 
ctx.param_vertex_id),
+                                            LLVMGetParam(func, SI_SGPR_BASE_VERTEX), 
"");
+               }
+
+               index = LLVMBuildBitCast(gallivm->builder, index, ctx.f32, "");
+               ret = LLVMBuildInsertValue(gallivm->builder, ret, index,
+                                          num_params++, "");
+       }
+
+       /* Compile. */
+       LLVMBuildRet(gallivm->builder, ret);
+       radeon_llvm_finalize_module(&ctx.radeon_bld);
+
+       if (si_compile_llvm(sscreen, &out->binary, &out->config, tm,
+                           gallivm->module, debug, ctx.type,
+                           "Vertex Shader Prolog"))
+               status = false;
+
+       radeon_llvm_dispose(&ctx.radeon_bld);
+       return status;
+}
+
+static bool si_shader_select_vs_parts(struct si_screen *sscreen,
+                                     LLVMTargetMachineRef tm,
+                                     struct si_shader *shader,
+                                     struct pipe_debug_callback *debug)
+{
+       struct tgsi_shader_info *info = &shader->selector->info;
+       union si_shader_part_key prolog_key;
+       unsigned i;
+
+       /* Get the prolog. */
+       memset(&prolog_key, 0, sizeof(prolog_key));
+       prolog_key.vs_prolog.states = shader->key.vs.prolog;
+       prolog_key.vs_prolog.num_input_sgprs = shader->num_input_sgprs;
+       prolog_key.vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
+
+       /* The prolog is a no-op if these are no inputs. */


s/these/there/

Nicolai

+       if (info->num_inputs) {
+               shader->prolog =
+                       si_get_shader_part(sscreen, &sscreen->vs_prologs,
+                                          &prolog_key, tm, debug,
+                                          si_compile_vs_prolog);
+               if (!shader->prolog)
+                       return false;
+       }
+
+       /* Set the instanceID flag. */
+       for (i = 0; i < info->num_inputs; i++)
+               if (prolog_key.vs_prolog.states.instance_divisors[i])
+                       shader->uses_instanceid = true;
+
+       return true;
+}
+
  int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
                     struct si_shader *shader,
                     struct pipe_debug_callback *debug)
@@ -4690,6 +4901,29 @@ int si_shader_create(struct si_screen *sscreen, 
LLVMTargetMachineRef tm,
        if (r)
                return r;

+       if (!sscreen->use_monolithic_shaders) {
+               switch (shader->selector->type) {
+               case PIPE_SHADER_VERTEX:
+                       if (!si_shader_select_vs_parts(sscreen, tm, shader, 
debug))
+                               return -1;
+                       break;
+               }
+
+               /* Update SGPR and VGPR counts. */
+               if (shader->prolog) {
+                       shader->config.num_sgprs = 
MAX2(shader->config.num_sgprs,
+                                                       
shader->prolog->config.num_sgprs);
+                       shader->config.num_vgprs = 
MAX2(shader->config.num_vgprs,
+                                                       
shader->prolog->config.num_vgprs);
+               }
+               if (shader->epilog) {
+                       shader->config.num_sgprs = 
MAX2(shader->config.num_sgprs,
+                                                       
shader->epilog->config.num_sgprs);
+                       shader->config.num_vgprs = 
MAX2(shader->config.num_vgprs,
+                                                       
shader->epilog->config.num_vgprs);
+               }
+       }
+
        si_shader_dump(sscreen, shader, debug, 
shader->selector->info.processor);

        /* Upload. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 66b3156..e3ba4c7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -268,6 +268,14 @@ struct si_ps_epilog_bits {
        unsigned        clamp_color:1;
  };

+union si_shader_part_key {
+       struct {
+               struct si_vs_prolog_bits states;
+               unsigned        num_input_sgprs:5;
+               unsigned        last_input:4;
+       } vs_prolog;
+};
+
  union si_shader_key {
        struct {
                struct si_ps_prolog_bits prolog;
@@ -327,6 +335,7 @@ struct si_shader {

  struct si_shader_part {
        struct si_shader_part *next;
+       union si_shader_part_key key;
        struct radeon_shader_binary binary;
        struct si_shader_config config;
  };

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 12/25] radeonsi: add VS prolog

Reply via email to