Mesa (main): turnip,ir3: Implement A7XX push consts load via preamble

GitLab Mirror Wed, 04 Oct 2023 09:30:47 -0700

Module: Mesa
Branch: main
Commit: a5f0f7d4b162c04878fb9d505d55ebdd05c5c773
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=a5f0f7d4b162c04878fb9d505d55ebdd05c5c773


Author: Danylo Piliaiev <[email protected]>
Date:   Tue Sep  5 18:24:03 2023 +0200

turnip,ir3: Implement A7XX push consts load via preamble

The new push consts loading scheme consists of:
- Push consts are set for the entire pipeline via the
  HLSQ_SHARED_CONSTS_IMM array, which can fit up to 256b of push consts.
- For each shader stage that uses push consts, READ_IMM_SHARED_CONSTS
  should be set in HLSQ_*_CNTL; otherwise push consts may get overwritten
  by new push consts that are set after the draw.
- Push consts are loaded into the consts reg file by an stsc placed at
  the very start of the shader preamble.

OPC_PUSH_CONSTS_LOAD_MACRO is used instead of directly translating the
NIR intrinsic into stsc because we don't want to teach the legalize pass
how to set (ss) between stores and loads of the consts reg file, we
don't want stsc to be reordered, etc.

Signed-off-by: Danylo Piliaiev <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25086>

---

 src/compiler/nir/nir_intrinsics.py                 |  4 ++
 src/freedreno/common/freedreno_dev_info.h          |  4 ++
 src/freedreno/common/freedreno_devices.py          |  4 +-
 src/freedreno/ir3/disasm-a3xx.c                    |  1 +
 src/freedreno/ir3/instr-a3xx.h                     |  7 +-
 src/freedreno/ir3/ir3.h                            |  5 ++
 src/freedreno/ir3/ir3_compiler.c                   |  2 +-
 src/freedreno/ir3/ir3_compiler.h                   |  2 +
 src/freedreno/ir3/ir3_compiler_nir.c               | 10 +++
 src/freedreno/ir3/ir3_legalize.c                   | 81 +++++++++++++++++-----
 src/freedreno/ir3/ir3_nir.c                        |  3 +
 src/freedreno/ir3/ir3_nir.h                        |  2 +
 .../ir3/ir3_nir_lower_push_consts_to_preamble.c    | 28 ++++++++
 src/freedreno/ir3/ir3_postsched.c                  |  4 ++
 src/freedreno/ir3/ir3_print.c                      |  8 ++-
 src/freedreno/ir3/ir3_sched.c                      |  4 ++
 src/freedreno/ir3/ir3_shader.h                     |  4 ++
 src/freedreno/ir3/meson.build                      |  1 +
 src/freedreno/vulkan/tu_cmd_buffer.cc              | 68 ++++++++++++------
 src/freedreno/vulkan/tu_pipeline.cc                | 19 +++--
 src/freedreno/vulkan/tu_shader.cc                  |  2 +
 21 files changed, 215 insertions(+), 48 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.py 
b/src/compiler/nir/nir_intrinsics.py
index 8bc2226ab69..4bc57a7613a 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1329,6 +1329,10 @@ store("uniform_ir3", [], indices=[BASE])
 # vec4's.
 intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE])
 
+# IR3-specific intrinsic for stsc. Loads from push consts to constant file
+# Should be used in the shader preamble.
+intrinsic("copy_push_const_to_uniform_ir3", [1], indices=[BASE, RANGE])
+
 # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined
 # within a blend shader to read/write the raw value from the tile buffer,
 # without applying any format conversion in the process. If the shader needs
diff --git a/src/freedreno/common/freedreno_dev_info.h 
b/src/freedreno/common/freedreno_dev_info.h
index 5dda81a24bf..4f2af42a8d4 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -191,6 +191,10 @@ struct fd_dev_info {
    } a6xx;
 
    struct {
+      /* stsc may need to be done twice for the same range to workaround
+       * _something_, observed in blob's disassembly.
+       */
+      bool stsc_duplication_quirk;
    } a7xx;
 };
 
diff --git a/src/freedreno/common/freedreno_devices.py 
b/src/freedreno/common/freedreno_devices.py
index 04695d30cff..7e9a3d73bd0 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -704,7 +704,9 @@ add_gpus([
 
 a7xx_730 = A7XXProps()
 
-a7xx_740 = A7XXProps()
+a7xx_740 = A7XXProps(
+        stsc_duplication_quirk = True,
+    )
 
 add_gpus([
         GPUId(chip_id=0x07030001, name="FD730"), # KGSL, no speedbin data
diff --git a/src/freedreno/ir3/disasm-a3xx.c b/src/freedreno/ir3/disasm-a3xx.c
index 489e45b038b..f842e7fdb1b 100644
--- a/src/freedreno/ir3/disasm-a3xx.c
+++ b/src/freedreno/ir3/disasm-a3xx.c
@@ -194,6 +194,7 @@ static const struct opc_info {
    OPC(1, OPC_SWZ_SHARED_MACRO, swz_shared.macro),
    OPC(1, OPC_SCAN_MACRO, scan.macro),
    OPC(1, OPC_SHPS_MACRO, shps.macro),
+   OPC(1, OPC_PUSH_CONSTS_LOAD_MACRO, push_consts_load.macro),
 
    /* category 2: */
    OPC(2, OPC_ADD_F,        add.f),
diff --git a/src/freedreno/ir3/instr-a3xx.h b/src/freedreno/ir3/instr-a3xx.h
index e9eaebc937c..777cfeb7113 100644
--- a/src/freedreno/ir3/instr-a3xx.h
+++ b/src/freedreno/ir3/instr-a3xx.h
@@ -131,6 +131,11 @@ typedef enum {
    /* Macros that expand to a loop */
    OPC_SCAN_MACRO      = _OPC(1, 58),
 
+   /* Macros that expand to an stsc at the start of the preamble.
+    * It loads into const file and should not be optimized in any way.
+    */
+   OPC_PUSH_CONSTS_LOAD_MACRO = _OPC(1, 59),
+
    /* category 2: */
    OPC_ADD_F           = _OPC(2, 0),
    OPC_MIN_F           = _OPC(2, 1),
@@ -406,7 +411,7 @@ typedef enum {
    /*
     * A manually encoded opcode
     */
-   OPC_META_RAW = _OPC(OPC_META, 7)
+   OPC_META_RAW = _OPC(OPC_META, 7),
 } opc_t;
 /* clang-format on */
 
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 221080ba261..7eb8ca36209 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -445,6 +445,10 @@ struct ir3_instruction {
           */
          gl_system_value sysval;
       } input;
+      struct {
+         unsigned src_base, src_size;
+         unsigned dst_base;
+      } push_consts;
       struct {
          uint64_t value;
       } raw;
@@ -2485,6 +2489,7 @@ INSTR1(QUAD_SHUFFLE_VERT)
 INSTR1(QUAD_SHUFFLE_DIAG)
 INSTR2NODST(LDC_K)
 INSTR2NODST(STC)
+INSTR2NODST(STSC)
 #ifndef GPU
 #elif GPU >= 600
 INSTR3NODST(STIB);
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index eb0001a0210..52d04fdd7d9 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -202,13 +202,13 @@ ir3_compiler_create(struct fd_device *dev, const struct 
fd_dev_id *dev_id,
          compiler->shared_consts_size = 8;
          compiler->geom_shared_consts_size_quirk = 16;
       } else {
-         /* A7XX TODO: properly use new shared consts mechanism */
          compiler->shared_consts_base_offset = -1;
          compiler->shared_consts_size = 0;
          compiler->geom_shared_consts_size_quirk = 0;
       }
 
       compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
+      compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 892d3627004..d51ef5519cc 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -245,6 +245,8 @@ struct ir3_compiler {
    uint64_t geom_shared_consts_size_quirk;
 
    bool has_fs_tex_prefetch;
+
+   bool stsc_duplication_quirk;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c 
b/src/freedreno/ir3/ir3_compiler_nir.c
index b8cd9848f25..54cc6ed3202 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2678,6 +2678,16 @@ emit_intrinsic(struct ir3_context *ctx, 
nir_intrinsic_instr *intr)
       array_insert(b, b->keeps, stc);
       break;
    }
+   case nir_intrinsic_copy_push_const_to_uniform_ir3: {
+      struct ir3_instruction *load =
+         ir3_instr_create(ctx->block, OPC_PUSH_CONSTS_LOAD_MACRO, 0, 0);
+      array_insert(b, b->keeps, load);
+
+      load->push_consts.dst_base = nir_src_as_uint(intr->src[0]);
+      load->push_consts.src_base = nir_intrinsic_base(intr);
+      load->push_consts.src_size = nir_intrinsic_range(intr);
+      break;
+   }
    default:
       ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
                         nir_intrinsic_infos[intr->intrinsic].name);
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 0626113d7a0..604e5bf2962 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -58,6 +58,7 @@ struct ir3_legalize_state {
    regmask_t needs_ss;
    regmask_t needs_ss_war; /* write after read */
    regmask_t needs_sy;
+   bool needs_ss_for_const;
 };
 
 struct ir3_legalize_block_data {
@@ -65,6 +66,17 @@ struct ir3_legalize_block_data {
    struct ir3_legalize_state state;
 };
 
+static inline void
+apply_ss(struct ir3_instruction *instr,
+         struct ir3_legalize_state *state,
+         bool mergedregs)
+{
+   instr->flags |= IR3_INSTR_SS;
+   regmask_init(&state->needs_ss_war, mergedregs);
+   regmask_init(&state->needs_ss, mergedregs);
+   state->needs_ss_for_const = false;
+}
+
 /* We want to evaluate each block from the position of any other
  * predecessor block, in order that the flags set are the union of
  * all possible program paths.
@@ -109,6 +121,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct 
ir3_block *block)
       regmask_or(&state->needs_ss_war, &state->needs_ss_war,
                  &pstate->needs_ss_war);
       regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
+      state->needs_ss_for_const |= pstate->needs_ss_for_const;
    }
 
    /* We need to take phsyical-only edges into account when tracking shared
@@ -162,17 +175,15 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct 
ir3_block *block)
       }
 
       if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
-         n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
-         last_input_needs_ss = false;
-         regmask_init(&state->needs_ss_war, mergedregs);
-         regmask_init(&state->needs_ss, mergedregs);
+         apply_ss(n, state, mergedregs);
+
+         n->flags |= IR3_INSTR_SY;
          regmask_init(&state->needs_sy, mergedregs);
+         last_input_needs_ss = false;
       }
 
       if (last_n && (last_n->opc == OPC_PREDT)) {
-         n->flags |= IR3_INSTR_SS;
-         regmask_init(&state->needs_ss_war, mergedregs);
-         regmask_init(&state->needs_ss, mergedregs);
+         apply_ss(n, state, mergedregs);
       }
 
       /* NOTE: consider dst register too.. it could happen that
@@ -195,25 +206,24 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct 
ir3_block *block)
              * some tests for both this and (sy)..
              */
             if (regmask_get(&state->needs_ss, reg)) {
-               n->flags |= IR3_INSTR_SS;
+               apply_ss(n, state, mergedregs);
                last_input_needs_ss = false;
-               regmask_init(&state->needs_ss_war, mergedregs);
-               regmask_init(&state->needs_ss, mergedregs);
             }
 
             if (regmask_get(&state->needs_sy, reg)) {
                n->flags |= IR3_INSTR_SY;
                regmask_init(&state->needs_sy, mergedregs);
             }
+         } else if ((reg->flags & IR3_REG_CONST) && state->needs_ss_for_const) 
{
+            apply_ss(n, state, mergedregs);
+            last_input_needs_ss = false;
          }
       }
 
       foreach_dst (reg, n) {
          if (regmask_get(&state->needs_ss_war, reg)) {
-            n->flags |= IR3_INSTR_SS;
+            apply_ss(n, state, mergedregs);
             last_input_needs_ss = false;
-            regmask_init(&state->needs_ss_war, mergedregs);
-            regmask_init(&state->needs_ss, mergedregs);
          }
       }
 
@@ -230,7 +240,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct 
ir3_block *block)
       }
 
       /* need to be able to set (ss) on first instruction: */
-      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
+      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && 
!is_meta(n))
          ir3_NOP(block);
 
       if (ctx->compiler->samgq_workaround &&
@@ -281,6 +291,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct 
ir3_block *block)
          } else {
             regmask_set(&state->needs_ss, n->dsts[0]);
          }
+      } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
+         state->needs_ss_for_const = true;
       }
 
       if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
@@ -324,9 +336,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct 
ir3_block *block)
 
             last_input->dsts[0]->flags |= IR3_REG_EI;
             if (last_input_needs_ss) {
-               last_input->flags |= IR3_INSTR_SS;
-               regmask_init(&state->needs_ss_war, mergedregs);
-               regmask_init(&state->needs_ss, mergedregs);
+               apply_ss(last_input, state, mergedregs);
             }
          }
       }
@@ -407,6 +417,36 @@ apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, 
struct ir3_block *block)
    return true;
 }
 
+static void
+apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
+                             struct ir3_block *block)
+{
+   foreach_instr (n, &block->instr_list) {
+      if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
+         struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 
2);
+         ir3_instr_move_after(stsc, n);
+         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
+            n->push_consts.dst_base;
+         ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
+            n->push_consts.src_base;
+         stsc->cat6.iim_val = n->push_consts.src_size;
+         stsc->cat6.type = TYPE_U32;
+
+         if (ctx->compiler->stsc_duplication_quirk) {
+            struct ir3_instruction *nop = ir3_NOP(block);
+            ir3_instr_move_after(nop, stsc);
+            nop->flags |= IR3_INSTR_SS;
+            ir3_instr_move_after(ir3_instr_clone(stsc), nop);
+         }
+
+         list_delinit(&n->node);
+         break;
+      } else if (!is_meta(n)) {
+         break;
+      }
+   }
+}
+
 /* NOTE: branch instructions are always the last instruction(s)
  * in the block.  We take advantage of this as we resolve the
  * branches, since "if (foo) break;" constructs turn into
@@ -1180,6 +1220,13 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant 
*so, int *max_bary)
       progress |= apply_fine_deriv_macro(ctx, block);
    }
 
+   foreach_block (block, &ir->block_list) {
+      if (block->brtype == IR3_BRANCH_GETONE) {
+         apply_push_consts_load_macro(ctx, block->successors[0]);
+         break;
+      }
+   }
+
    nop_sched(ir, so);
 
    while (opt_jump(ir))
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 2598064af7e..21b3576ec7b 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -747,6 +747,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, 
nir_shader *s)
 
    progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
 
+   if (so->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE)
+      progress |= OPT(s, ir3_nir_lower_push_consts_to_preamble, so);
+
    progress |= OPT(s, ir3_nir_lower_preamble, so);
 
    OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index b8cf29ba1f4..de6b70e7877 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -40,6 +40,8 @@ bool ir3_nir_lower_imul(nir_shader *shader);
 bool ir3_nir_lower_io_offsets(nir_shader *shader);
 bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader);
 bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader);
+bool ir3_nir_lower_push_consts_to_preamble(nir_shader *nir,
+                                           struct ir3_shader_variant *v);
 bool ir3_nir_move_varying_inputs(nir_shader *shader);
 int ir3_nir_coord_offset(nir_def *ssa);
 bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
diff --git a/src/freedreno/ir3/ir3_nir_lower_push_consts_to_preamble.c 
b/src/freedreno/ir3/ir3_nir_lower_push_consts_to_preamble.c
new file mode 100644
index 00000000000..056edb70d83
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_push_consts_to_preamble.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright © 2023 Igalia S.L.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "util/u_math.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+bool
+ir3_nir_lower_push_consts_to_preamble(nir_shader *nir,
+                                      struct ir3_shader_variant *v)
+{
+   nir_function_impl *preamble = nir_shader_get_preamble(nir);
+   nir_builder _b = nir_builder_at(nir_before_impl(preamble));
+   nir_builder *b = &_b;
+
+   nir_copy_push_const_to_uniform_ir3(
+      b, nir_imm_int(b, 0), .base = v->shader_options.push_consts_base,
+      .range = v->shader_options.push_consts_dwords);
+
+   nir_foreach_function_impl(impl, nir) {
+      nir_metadata_preserve(impl, nir_metadata_none);
+   }
+   return true;
+}
diff --git a/src/freedreno/ir3/ir3_postsched.c 
b/src/freedreno/ir3/ir3_postsched.c
index 97c05e0e336..29bf3a64847 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -691,6 +691,10 @@ sched_block(struct ir3_postsched_ctx *ctx, struct 
ir3_block *block)
       if (instr->opc == OPC_META_TEX_PREFETCH)
          schedule(ctx, instr);
 
+   foreach_instr_safe (instr, &ctx->unscheduled_list)
+      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
+         schedule(ctx, instr);
+
    while (!list_is_empty(&ctx->unscheduled_list)) {
       struct ir3_instruction *instr = choose_instr(ctx);
 
diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c
index e85513d3ea9..b1d87545118 100644
--- a/src/freedreno/ir3/ir3_print.c
+++ b/src/freedreno/ir3/ir3_print.c
@@ -181,7 +181,8 @@ print_instr_name(struct log_stream *stream, struct 
ir3_instruction *instr,
          }
       }
 
-      if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO) {
+      if (instr->opc != OPC_MOVMSK && instr->opc != OPC_SCAN_MACRO &&
+          instr->opc != OPC_PUSH_CONSTS_LOAD_MACRO) {
          mesa_log_stream_printf(stream, ".%s%s",
                                 type_name(instr->cat1.src_type),
                                 type_name(instr->cat1.dst_type));
@@ -405,6 +406,11 @@ print_instr(struct log_stream *stream, struct 
ir3_instruction *instr, int lvl)
       mesa_log_stream_printf(stream, ", tex=%d, samp=%d, input_offset=%d",
                              instr->prefetch.tex, instr->prefetch.samp,
                              instr->prefetch.input_offset);
+   } else if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
+      mesa_log_stream_printf(
+         stream, " dst_offset=%d, src_offset = %d, src_size = %d",
+         instr->push_consts.dst_base, instr->push_consts.src_base,
+         instr->push_consts.src_size);
    }
 
    if (is_flow(instr) && instr->cat0.target) {
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 814920d2d1f..31f709b52d8 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -1235,6 +1235,10 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block 
*block)
       if (instr->opc == OPC_META_TEX_PREFETCH)
          schedule(ctx, instr);
 
+   foreach_instr_safe (instr, &ctx->unscheduled_list)
+      if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
+         schedule(ctx, instr);
+
    while (!list_is_empty(&ctx->unscheduled_list)) {
       struct ir3_sched_notes notes = {0};
       struct ir3_instruction *instr;
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 345d2af7104..d2686fcb104 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -151,6 +151,7 @@ enum ir3_push_consts_type {
    IR3_PUSH_CONSTS_NONE,
    IR3_PUSH_CONSTS_PER_STAGE,
    IR3_PUSH_CONSTS_SHARED,
+   IR3_PUSH_CONSTS_SHARED_PREAMBLE,
 };
 
 /**
@@ -507,6 +508,9 @@ struct ir3_shader_options {
     */
    enum ir3_wavesize_option real_wavesize;
    enum ir3_push_consts_type push_consts_type;
+
+   uint32_t push_consts_base;
+   uint32_t push_consts_dwords;
 };
 
 /**
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
index 0e051958805..954b0b88fdb 100644
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@@ -95,6 +95,7 @@ libfreedreno_ir3_files = files(
   'ir3_nir_lower_64b.c',
   'ir3_nir_lower_load_barycentric_at_sample.c',
   'ir3_nir_lower_load_barycentric_at_offset.c',
+  'ir3_nir_lower_push_consts_to_preamble.c',
   'ir3_nir_lower_io_offsets.c',
   'ir3_nir_lower_tess.c',
   'ir3_nir_lower_tex_prefetch.c',
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc 
b/src/freedreno/vulkan/tu_cmd_buffer.cc
index ea48e4b7ac1..e83c7d1ddba 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -4256,9 +4256,10 @@ tu6_user_consts_size(const struct tu_const_state 
*const_state,
 {
    uint32_t dwords = 0;
 
-   if (const_state->push_consts.dwords > 0) {
+   if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
       unsigned num_units = const_state->push_consts.dwords;
       dwords += 4 + num_units;
+      assert(num_units > 0);
    }
 
    dwords += 8 * const_state->num_inline_ubos;
@@ -4267,12 +4268,10 @@ tu6_user_consts_size(const struct tu_const_state 
*const_state,
 }
 
 static void
-tu6_emit_user_consts(struct tu_cs *cs,
-                     const struct tu_const_state *const_state,
-                     unsigned constlen,
-                     gl_shader_stage type,
-                     struct tu_descriptor_state *descriptors,
-                     uint32_t *push_constants)
+tu6_emit_per_stage_push_consts(struct tu_cs *cs,
+                               const struct tu_const_state *const_state,
+                               gl_shader_stage type,
+                               uint32_t *push_constants)
 {
    if (const_state->push_consts.type == IR3_PUSH_CONSTS_PER_STAGE) {
       unsigned num_units = const_state->push_consts.dwords;
@@ -4291,7 +4290,15 @@ tu6_emit_user_consts(struct tu_cs *cs,
       for (unsigned i = 0; i < num_units; i++)
          tu_cs_emit(cs, push_constants[i + offset]);
    }
+}
 
+static void
+tu6_emit_inline_ubo(struct tu_cs *cs,
+                    const struct tu_const_state *const_state,
+                    unsigned constlen,
+                    gl_shader_stage type,
+                    struct tu_descriptor_state *descriptors)
+{
    /* Emit loads of inline uniforms. These load directly from the uniform's
     * storage space inside the descriptor set.
     */
@@ -4349,6 +4356,18 @@ tu6_emit_shared_consts(struct tu_cs *cs,
    }
 }
 
+static void
+tu7_emit_shared_preamble_consts(
+   struct tu_cs *cs,
+   const struct tu_push_constant_range *shared_consts,
+   uint32_t *push_constants)
+{
+   tu_cs_emit_pkt4(cs, REG_A7XX_HLSQ_SHARED_CONSTS_IMM(shared_consts->lo),
+                   shared_consts->dwords);
+   tu_cs_emit_array(cs, push_constants + shared_consts->lo,
+                    shared_consts->dwords);
+}
+
 static uint32_t
 tu6_const_size(struct tu_cmd_buffer *cmd,
                const struct tu_push_constant_range *shared_consts,
@@ -4358,6 +4377,8 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
 
    if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
       dwords += shared_consts->dwords + 4;
+   } else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
+      dwords += shared_consts->dwords + 1;
    }
 
    if (compute) {
@@ -4372,8 +4393,7 @@ tu6_const_size(struct tu_cmd_buffer *cmd,
 }
 
 static struct tu_draw_state
-tu6_emit_consts(struct tu_cmd_buffer *cmd,
-                bool compute)
+tu_emit_consts(struct tu_cmd_buffer *cmd, bool compute)
 {
    uint32_t dwords = 0;
    const struct tu_push_constant_range *shared_consts =
@@ -4390,24 +4410,30 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd,
 
    if (shared_consts->type == IR3_PUSH_CONSTS_SHARED) {
       tu6_emit_shared_consts(&cs, shared_consts, cmd->push_constants, compute);
+   } else if (shared_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
+      tu7_emit_shared_preamble_consts(&cs, shared_consts, cmd->push_constants);
    }
 
    if (compute) {
-      tu6_emit_user_consts(&cs,
-                           
&cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
-                           
cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
-                           MESA_SHADER_COMPUTE,
-                           tu_get_descriptors_state(cmd, 
VK_PIPELINE_BIND_POINT_COMPUTE),
-                           cmd->push_constants);
+      tu6_emit_per_stage_push_consts(
+         &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
+         MESA_SHADER_COMPUTE, cmd->push_constants);
+      tu6_emit_inline_ubo(
+         &cs, &cmd->state.shaders[MESA_SHADER_COMPUTE]->const_state,
+         cmd->state.shaders[MESA_SHADER_COMPUTE]->variant->constlen,
+         MESA_SHADER_COMPUTE,
+         tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_COMPUTE));
    } else {
-      struct tu_descriptor_state *descriptors  =
+      struct tu_descriptor_state *descriptors =
          tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
       for (uint32_t type = MESA_SHADER_VERTEX; type <= MESA_SHADER_FRAGMENT; 
type++) {
          const struct tu_program_descriptor_linkage *link =
             &cmd->state.program.link[type];
-         tu6_emit_user_consts(&cs, &link->tu_const_state, link->constlen,
-                              (gl_shader_stage) type,
-                              descriptors, cmd->push_constants);
+         tu6_emit_per_stage_push_consts(&cs, &link->tu_const_state,
+                                        (gl_shader_stage) type,
+                                        cmd->push_constants);
+         tu6_emit_inline_ubo(&cs, &link->tu_const_state, link->constlen,
+                             (gl_shader_stage) type, descriptors);
       }
    }
 
@@ -4751,7 +4777,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
    }
 
    if (dirty & TU_CMD_DIRTY_SHADER_CONSTS)
-      cmd->state.shader_const = tu6_emit_consts(cmd, false);
+      cmd->state.shader_const = tu_emit_consts(cmd, false);
 
    if (dirty & TU_CMD_DIRTY_DESC_SETS)
       tu6_emit_descriptor_sets<CHIP>(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
@@ -5502,7 +5528,7 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
    tu_emit_cache_flush<CHIP>(cmd);
 
    /* note: no reason to have this in a separate IB */
-   tu_cs_emit_state_ib(cs, tu6_emit_consts(cmd, true));
+   tu_cs_emit_state_ib(cs, tu_emit_consts(cmd, true));
 
    tu_emit_compute_driver_params<CHIP>(cmd, cs, info);
 
diff --git a/src/freedreno/vulkan/tu_pipeline.cc 
b/src/freedreno/vulkan/tu_pipeline.cc
index 1725b82e0bb..342fd3b1e7a 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -318,7 +318,11 @@ tu_push_consts_type(const struct tu_pipeline_layout 
*layout,
    if (tu6_shared_constants_enable(layout, compiler)) {
       return IR3_PUSH_CONSTS_SHARED;
    } else {
-      return IR3_PUSH_CONSTS_PER_STAGE;
+      if (compiler->gen >= 7) {
+         return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
+      } else {
+         return IR3_PUSH_CONSTS_PER_STAGE;
+      }
    }
 }
 
@@ -385,7 +389,9 @@ tu6_emit_xs_config(struct tu_cs *cs,
 
    tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
-                  A6XX_HLSQ_VS_CNTL_ENABLED);
+                     A6XX_HLSQ_VS_CNTL_ENABLED |
+                     COND(xs->shader_options.push_consts_type == 
IR3_PUSH_CONSTS_SHARED_PREAMBLE,
+                          A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS));
 }
 TU_GENX(tu6_emit_xs_config);
 
@@ -2335,10 +2341,11 @@ tu_pipeline_builder_parse_shader_stages(struct 
tu_pipeline_builder *builder,
                               &pipeline->shaders[i]->const_state,
                               variants[i]);
 
-      if (pipeline->shaders[i]->const_state.push_consts.type ==
-          IR3_PUSH_CONSTS_SHARED) {
-         pipeline->program.shared_consts =
-            pipeline->shaders[i]->const_state.push_consts;
+      struct tu_push_constant_range *push_consts =
+         &pipeline->shaders[i]->const_state.push_consts;
+      if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
+          push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
+         pipeline->program.shared_consts = *push_consts;
       }
    }
 
diff --git a/src/freedreno/vulkan/tu_shader.cc 
b/src/freedreno/vulkan/tu_shader.cc
index ff6b4293e3b..6745da7cd24 100644
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@@ -2286,6 +2286,8 @@ tu_shader_create(struct tu_device *dev,
       .api_wavesize = key->api_wavesize,
       .real_wavesize = key->real_wavesize,
       .push_consts_type = shader->const_state.push_consts.type,
+      .push_consts_base = shader->const_state.push_consts.lo,
+      .push_consts_dwords = shader->const_state.push_consts.dwords,
    };
 
    struct ir3_shader *ir3_shader =

Mesa (main): turnip,ir3: Implement A7XX push consts load via preamble

Reply via email to